#ifndef INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

#if defined(NLIB_SSE41)
#include <smmintrin.h>
typedef __m128i i128;     // i128 inferred from the intrinsics used below
#elif defined(NLIB_NEON)
#include <arm_neon.h>
typedef int8x16_t i128;   // i128 inferred from the intrinsics used below
#endif

#if !defined(_MSC_VER) && !defined(__vectorcall)
#define __vectorcall
#endif

#if defined(NLIB_SIMD)

#if defined(_MSC_VER) && _MSC_VER < 1800
typedef const i128& i128arg;
#else
typedef const i128 i128arg;
#endif
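// Note: i128arg above falls back to a const reference on MSVC older than
// VS2013 (_MSC_VER < 1800), which does not reliably support passing 128-bit
// vector types by value; elsewhere the argument travels in a register.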
// Integer SIMD operations on a 128-bit register (16 x 8-bit, 8 x 16-bit,
// 4 x 32-bit, or 2 x 64-bit lanes).
class I128 {
 public:
  static i128 __vectorcall SetZero() NLIB_NOEXCEPT;
  static i128 __vectorcall SetFull(i128arg dummy) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA16(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA1(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA1(const void* p) NLIB_NOEXCEPT;
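  // The A<n> suffix states the alignment the pointer is guaranteed to have,
  // in bytes: LoadA16 requires a 16-byte-aligned address, LoadA1 accepts any
  // address. The Lo/Hi variants load only 8 bytes into the low or high half
  // of the register and zero-fill the other half.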
#define NLIB_LOAD_REDIRECT(func) \
  static i128 __vectorcall func(uintptr_t p) NLIB_NOEXCEPT { \
    return func(reinterpret_cast<void*>(p)); \
  } \
  static i128 __vectorcall func(intptr_t p) NLIB_NOEXCEPT { \
    return func(reinterpret_cast<void*>(p)); \
  }
  NLIB_LOAD_REDIRECT(LoadA16)
  NLIB_LOAD_REDIRECT(LoadA8)
  NLIB_LOAD_REDIRECT(LoadA4)
  NLIB_LOAD_REDIRECT(LoadA2)
  NLIB_LOAD_REDIRECT(LoadA1)
  NLIB_LOAD_REDIRECT(LoadLoA8)
  NLIB_LOAD_REDIRECT(LoadLoA4)
  NLIB_LOAD_REDIRECT(LoadLoA2)
  NLIB_LOAD_REDIRECT(LoadLoA1)
  NLIB_LOAD_REDIRECT(LoadHiA8)
  NLIB_LOAD_REDIRECT(LoadHiA4)
  NLIB_LOAD_REDIRECT(LoadHiA2)
  NLIB_LOAD_REDIRECT(LoadHiA1)
#undef NLIB_LOAD_REDIRECT
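  // Usage sketch (illustrative, not from the original header): the redirect
  // overloads accept an address carried as an integer.
  //
  //   uintptr_t addr = reinterpret_cast<uintptr_t>(buffer);
  //   i128 v = I128::LoadA16(addr);  // equivalent to LoadA16(buffer)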
  static void __vectorcall StoreA16(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA1(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT;
#define NLIB_STORE_REDIRECT(func) \
  static void __vectorcall func(uintptr_t p, i128arg value) NLIB_NOEXCEPT { \
    func(reinterpret_cast<void*>(p), value); \
  } \
  static void __vectorcall func(intptr_t p, i128arg value) NLIB_NOEXCEPT { \
    func(reinterpret_cast<void*>(p), value); \
  }
  NLIB_STORE_REDIRECT(StoreA16)
  NLIB_STORE_REDIRECT(StoreA8)
  NLIB_STORE_REDIRECT(StoreA4)
  NLIB_STORE_REDIRECT(StoreA2)
  NLIB_STORE_REDIRECT(StoreA1)
  NLIB_STORE_REDIRECT(StoreLoA8)
  NLIB_STORE_REDIRECT(StoreLoA4)
  NLIB_STORE_REDIRECT(StoreLoA2)
  NLIB_STORE_REDIRECT(StoreLoA1)
  NLIB_STORE_REDIRECT(StoreHiA8)
  NLIB_STORE_REDIRECT(StoreHiA4)
  NLIB_STORE_REDIRECT(StoreHiA2)
  NLIB_STORE_REDIRECT(StoreHiA1)
#undef NLIB_STORE_REDIRECT
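  // Usage sketch (illustrative): store only the low 8 bytes of a register to
  // an address with no particular alignment.
  //
  //   uint8_t out[8];
  //   I128::StoreLoA1(out, v);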
  // Lane accessors (the template headers are reconstructed; the lane index N
  // is used as a compile-time constant by the implementations below).
  template <size_t N>
  static uint8_t __vectorcall GetUint8FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint16_t __vectorcall GetUint16FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint32_t __vectorcall GetUint32FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint64_t __vectorcall GetUint64FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT;
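  // Usage sketch (illustrative): the lane index is a compile-time constant.
  //
  //   uint32_t x = I128::GetUint32FromLane<2>(v);     // read lane 2 (of 4)
  //   i128 w = I128::SetUint16ToLane<0>(v, 0xFFFFu);  // replace lane 0 (of 8)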
  static i128 __vectorcall Add8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
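  // Usage sketch (illustrative; SetValue is declared earlier in the full
  // header): saturated ops clamp at the type bounds instead of wrapping.
  //
  //   i128 a = I128::SetValue(200, each_uint8);
  //   i128 b = I128::SetValue(100, each_uint8);
  //   i128 s = I128::AddUint8Saturated(a, b);  // every lane is 255, not 44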
  static i128 __vectorcall And(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Or(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Xor(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Not(i128arg a) NLIB_NOEXCEPT;
  static i128 __vectorcall AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test32(i128arg a, i128arg b) NLIB_NOEXCEPT;
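  // Note: AndNot(a, b) computes ~a & b (the _mm_andnot_si128 convention),
  // OrNot(a, b) computes ~a | b, and TestN(a, b) sets a lane to all ones
  // exactly when (a & b) is nonzero in that lane.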
  static i128 __vectorcall CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero64(i128arg value) NLIB_NOEXCEPT;
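  // Comparison results are lane masks: each lane is all ones where the
  // predicate holds and all zeros where it does not, so the results compose
  // directly with And/Or/Select and with MoveMask below.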
  static i128 __vectorcall ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT;
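  // The variants below take the shift count as a template argument instead,
  // which allows immediate-form shift instructions on both SSE and NEON.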
  // (Template headers reconstructed; the original count parameter name is
  // not preserved in this extract.)
  template <int count>
  static i128 __vectorcall ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT;

  template <int count>
  static i128 __vectorcall ByteShiftLeft(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ByteShiftRight(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall ByteRotateRight(i128arg value) NLIB_NOEXCEPT;
  template <int count>
  static i128 __vectorcall AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
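  // Usage sketch (illustrative): ZipNLo/ZipNHi interleave lanes taken from
  // the low or high halves of two registers; with a = {a0, a1, ...} and
  // b = {b0, b1, ...}, Zip8Lo(a, b) yields {a0, b0, a1, b1, ..., a7, b7}.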
  template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
            int V8, int V9, int V10, int V11, int V12, int V13, int V14,
            int V15>
  static i128 __vectorcall Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
  static i128 __vectorcall Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  template <int V0, int V1, int V2, int V3>
  static i128 __vectorcall Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT;
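  // Usage sketch (illustrative): each Vn picks a source lane; indices below
  // the lane count select from a, larger ones select from b.
  //
  //   i128 r = I128::Permute32<0, 4, 1, 5>(a, b);  // {a0, b0, a1, b1}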
  static i128 __vectorcall Reverse16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Reverse32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Reverse64(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall MoveMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall MoveMask16(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall MoveMask32(i128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsZero(i128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsFull(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT;
  static int __vectorcall PopCntMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall ClzMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall CtzMask8(i128arg value) NLIB_NOEXCEPT;
};
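// Usage sketch (illustrative): locate the first zero byte in a 16-byte block.
//
//   i128 chunk = I128::LoadA1(p);
//   i128 eq = I128::CmpEqZero8(chunk);
//   if (!I128::IsZero(eq)) {
//     int idx = I128::CtzMask8(eq);  // index of the first all-ones lane
//   }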
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

// On NEON every vector is carried as int8x16_t; these helpers reinterpret to
// the element type, apply the intrinsic, and reinterpret back.
// vreinterpretq_s8_s8 is the identity.
#undef vreinterpret_s8_s8
#define vreinterpretq_s8_s8(a) (a)
#define NLIB_OP1(intrin, tp, a) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a)))
#define NLIB_OP2(intrin, tp, a, b) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                      vreinterpretq_##tp##_s8(b)))
#define NLIB_OP3(intrin, tp, a, b, c) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                      vreinterpretq_##tp##_s8(b), \
                                      vreinterpretq_##tp##_s8(c)))
#define NLIB_CMP(intrin, tp, a, b, utp) \
  vreinterpretq_s8_##utp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                       vreinterpretq_##tp##_s8(b)))
#define NLIB_SFT(intrin, tp, a, cnt, stp) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vdupq_n_##stp(cnt)))
#define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))

// The SetValue implementations below broadcast a scalar to every lane. Their
// signature lines are not preserved in this extract; the 'each_*_t' names
// used here are stand-ins for the original tag-dispatch parameter types.
NLIB_M(i128) I128::SetValue(int8_t v, each_int8_t) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vdupq_n_s8(v);
#endif
}

NLIB_M(i128) I128::SetValue(int16_t v, each_int16_t) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(v);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s16(vdupq_n_s16(v));
#endif
}

NLIB_M(i128) I128::SetValue(int32_t v, each_int32_t) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(v);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vdupq_n_s32(v));
#endif
}

NLIB_M(i128) I128::SetValue(int64_t v, each_int64_t) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if !defined(NLIB_64BIT)
  // Reconstructed: 32-bit builds lack _mm_set1_epi64x, so go through memory.
  NLIB_ALIGNAS(16) int64_t tmp[2] = { v, v };
  return I128::LoadA16(tmp);
#else
  return _mm_set1_epi64x(v);
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s64(vdupq_n_s64(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint8_t v, each_uint8_t) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u8(vdupq_n_u8(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint16_t v, each_uint16_t) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(static_cast<int16_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u16(vdupq_n_u16(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint32_t v, each_uint32_t) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(static_cast<int32_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u32(vdupq_n_u32(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint64_t v, each_uint64_t) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if !defined(NLIB_64BIT)
  NLIB_ALIGNAS(16) uint64_t tmp[2] = { v, v };
  return I128::LoadA16(tmp);
#else
  return _mm_set1_epi64x(static_cast<int64_t>(v));
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vdupq_n_u64(v));
#endif
}
// Broadcast of 32-bit lane N (signatures reconstructed; 'each_select32_t' is
// a stand-in for the original tag-type name).
#if defined(NLIB_SSE41)
template <size_t N>
NLIB_M(i128) I128::SetValue(i128arg value, each_select32_t) NLIB_NOEXCEPT {
  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
}
#elif defined(NLIB_NEON)
#ifdef __aarch64__
template <size_t N>
NLIB_M(i128) I128::SetValue(i128arg value, each_select32_t) NLIB_NOEXCEPT {
  uint32x4_t v = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vdupq_laneq_u32(v, N));
}
#else
// ARMv7 NEON can only duplicate a lane of a 64-bit half, hence one
// specialization per lane.
template <>
NLIB_M(i128) I128::SetValue<0>(i128arg value, each_select32_t) NLIB_NOEXCEPT {
  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
}
template <>
NLIB_M(i128) I128::SetValue<1>(i128arg value, each_select32_t) NLIB_NOEXCEPT {
  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
}
template <>
NLIB_M(i128) I128::SetValue<2>(i128arg value, each_select32_t) NLIB_NOEXCEPT {
  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
}
template <>
NLIB_M(i128) I128::SetValue<3>(i128arg value, each_select32_t) NLIB_NOEXCEPT {
  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
}
#endif
#endif
// Broadcast of 16-bit lane N (signatures reconstructed; 'each_select16_t' is
// a stand-in for the original tag-type name).
#if defined(NLIB_SSE41)
template <size_t N>
NLIB_M(i128) I128::SetValue(i128arg value, each_select16_t) NLIB_NOEXCEPT {
  NLIB_ALIGNAS(16) static const uint8_t mask[16] = {
    2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
    2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(mask));
}
#elif defined(NLIB_NEON)
template <>
NLIB_M(i128) I128::SetValue<0>(i128arg value, each_select16_t) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 0));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<1>(i128arg value, each_select16_t) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 1));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<2>(i128arg value, each_select16_t) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 2));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<3>(i128arg value, each_select16_t) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 3));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<4>(i128arg value, each_select16_t) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 4));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<5>(i128arg value, each_select16_t) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 5));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<6>(i128arg value, each_select16_t) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 6));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
#endif
}
template <>
NLIB_M(i128) I128::SetValue<7>(i128arg value, each_select16_t) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 7));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
#endif
}
#endif
// Broadcast of 8-bit lane N (signatures reconstructed; 'each_select8_t' is a
// stand-in for the original tag-type name).
#if defined(NLIB_SSE41)
template <size_t N>
NLIB_M(i128) I128::SetValue(i128arg value, each_select8_t) NLIB_NOEXCEPT {
  NLIB_ALIGNAS(16) static const uint8_t mask[16] = {
    N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
}
#elif defined(NLIB_NEON)
namespace detail {

template <size_t N, bool IsLower>
struct SetValue8Helper {
  int8x16_t operator()(i128arg value) const {
    return vdupq_lane_s8(vget_low_s8(value), N);
  }
};

template <size_t N>
struct SetValue8Helper<N, false> {
  int8x16_t operator()(i128arg value) const {
    return vdupq_lane_s8(vget_high_s8(value), N - 8);
  }
};

}  // namespace detail

template <size_t N>
NLIB_M(i128) I128::SetValue(i128arg value, each_select8_t) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vdupq_laneq_s8(value, N);
#else
  return detail::SetValue8Helper<N, (N < 8)>()(value);
#endif
}
#endif
NLIB_M(i128) I128::SetZero() NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_setzero_si128();
#elif defined(NLIB_NEON)
  return vdupq_n_s8(0);
#endif
}
// Returns a register with every bit set.
NLIB_M(i128) I128::SetFull(i128arg dummy) NLIB_NOEXCEPT {
  return I128::CmpEq8(dummy, dummy);
}
NLIB_M(i128) I128::LoadA16(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_load_si128(static_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vld1q_u64(static_cast<const uint64_t*>(p));
  return vreinterpretq_s8_u64(tmp);
#endif
}

NLIB_M(i128) I128::LoadA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(static_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vld1q_u64(static_cast<const uint64_t*>(p));
  return vreinterpretq_s8_u64(tmp);
#endif
}

NLIB_M(i128) I128::LoadA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(static_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vld1q_u32(static_cast<const uint32_t*>(p));
  return vreinterpretq_s8_u32(tmp);
#endif
}

NLIB_M(i128) I128::LoadA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(static_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint16x8_t tmp = vld1q_u16(static_cast<const uint16_t*>(p));
  return vreinterpretq_s8_u16(tmp);
#endif
}

NLIB_M(i128) I128::LoadA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(static_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  return vld1q_s8(static_cast<const int8_t*>(p));
#endif
}

NLIB_M(i128) I128::LoadLoA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u64(vld1_u64(static_cast<const uint64_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

NLIB_M(i128) I128::LoadLoA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u32(vld1_u32(static_cast<const uint32_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

NLIB_M(i128) I128::LoadLoA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u16(vld1_u16(static_cast<const uint16_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

NLIB_M(i128) I128::LoadLoA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(static_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vld1_s8(static_cast<const int8_t*>(p));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

NLIB_M(i128) I128::LoadHiA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u64(vld1_u64(static_cast<const uint64_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

NLIB_M(i128) I128::LoadHiA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u32(vld1_u32(static_cast<const uint32_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

NLIB_M(i128) I128::LoadHiA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u16(vld1_u16(static_cast<const uint16_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

NLIB_M(i128) I128::LoadHiA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(static_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vld1_s8(static_cast<const int8_t*>(p));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}
NLIB_M(void) I128::StoreA16(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_store_si128(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(static_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

NLIB_M(void) I128::StoreA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(static_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

NLIB_M(void) I128::StoreA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u32(static_cast<uint32_t*>(p), vreinterpretq_u32_s8(value));
#endif
}

NLIB_M(void) I128::StoreA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u16(static_cast<uint16_t*>(p), vreinterpretq_u16_s8(value));
#endif
}

NLIB_M(void) I128::StoreA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_s8(static_cast<int8_t*>(p), value);
#endif
}

NLIB_M(void) I128::StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint64x1_t x = vreinterpret_u64_s8(vget_low_s8(value));
  vst1_u64(static_cast<uint64_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint32x2_t x = vreinterpret_u32_s8(vget_low_s8(value));
  vst1_u32(static_cast<uint32_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint16x4_t x = vreinterpret_u16_s8(vget_low_s8(value));
  vst1_u16(static_cast<uint16_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  int8x8_t x = vget_low_s8(value);
  vst1_s8(static_cast<int8_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint64x1_t x = vreinterpret_u64_s8(vget_high_s8(value));
  vst1_u64(static_cast<uint64_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint32x2_t x = vreinterpret_u32_s8(vget_high_s8(value));
  vst1_u32(static_cast<uint32_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint16x4_t x = vreinterpret_u16_s8(vget_high_s8(value));
  vst1_u16(static_cast<uint16_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(static_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int8x8_t x = vget_high_s8(value);
  vst1_s8(static_cast<int8_t*>(p), x);
#endif
}
template <size_t N>
NLIB_M(uint8_t) I128::GetUint8FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u8(vreinterpretq_u8_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint16_t) I128::GetUint16FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u16(vreinterpretq_u16_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint32_t) I128::GetUint32FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u32(vreinterpretq_u32_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint64_t) I128::GetUint64FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint64_t>(_mm_extract_epi64(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u64(vreinterpretq_u64_s8(value), N);
#endif
}

#if defined(NLIB_SSE41) && !defined(NLIB_64BIT)
// On 32-bit x86 there is no _mm_extract_epi64; go through memory instead.
template <>
NLIB_M(uint64_t) I128::GetUint64FromLane<0>(i128arg value) NLIB_NOEXCEPT {
  uint64_t rval;
  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), value);
  return rval;
}
template <>
NLIB_M(uint64_t) I128::GetUint64FromLane<1>(i128arg value) NLIB_NOEXCEPT {
  uint64_t rval;
  i128 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), tmp);
  return rval;
}
#endif
template <size_t N>
NLIB_M(i128) I128::SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
#elif defined(NLIB_NEON)
  // For a compile-time v, blend a broadcast of v through Permute8; otherwise
  // insert the lane directly.
  return __builtin_constant_p(v) ?
      I128::Permute8<N == 0 ? 16 : 0,
                     N == 1 ? 17 : 1,
                     N == 2 ? 18 : 2,
                     N == 3 ? 19 : 3,
                     N == 4 ? 20 : 4,
                     N == 5 ? 21 : 5,
                     N == 6 ? 22 : 6,
                     N == 7 ? 23 : 7,
                     N == 8 ? 24 : 8,
                     N == 9 ? 25 : 9,
                     N == 10 ? 26 : 10,
                     N == 11 ? 27 : 11,
                     N == 12 ? 28 : 12,
                     N == 13 ? 29 : 13,
                     N == 14 ? 30 : 14,
                     N == 15 ? 31 : 15>(value, vreinterpretq_s8_u8(vdupq_n_u8(v))) :
      vreinterpretq_s8_u8(vsetq_lane_u8(v, vreinterpretq_u8_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i128) I128::SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v) ?
      I128::Permute16<N == 0 ? 8 : 0,
                      N == 1 ? 9 : 1,
                      N == 2 ? 10 : 2,
                      N == 3 ? 11 : 3,
                      N == 4 ? 12 : 4,
                      N == 5 ? 13 : 5,
                      N == 6 ? 14 : 6,
                      N == 7 ? 15 : 7>(value, vreinterpretq_s8_u16(vdupq_n_u16(v))) :
      vreinterpretq_s8_u16(vsetq_lane_u16(v, vreinterpretq_u16_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i128) I128::SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi32(value, static_cast<uint32_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v) ?
      I128::Permute32<N == 0 ? 4 : 0,
                      N == 1 ? 5 : 1,
                      N == 2 ? 6 : 2,
                      N == 3 ? 7 : 3>(value, vreinterpretq_s8_u32(vdupq_n_u32(v))) :
      vreinterpretq_s8_u32(vsetq_lane_u32(v, vreinterpretq_u32_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i128) I128::SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if defined(NLIB_64BIT)
  return _mm_insert_epi64(value, static_cast<int64_t>(v), N);
#else
  // 32-bit x86 lacks _mm_insert_epi64; insert as two 32-bit halves.
  union {
    int64_t i64;
    int32_t i32[2];
  } tmp;
  i128 rval;
  tmp.i64 = static_cast<int64_t>(v);
  rval = _mm_insert_epi32(value, tmp.i32[0], N * 2 + 0);
  return _mm_insert_epi32(rval, tmp.i32[1], N * 2 + 1);
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vsetq_lane_u64(v, vreinterpretq_u64_s8(value), N));
#endif
}
NLIB_M(i128) I128::Add8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi8(a, b);
#elif defined(NLIB_NEON)
  return vaddq_s8(a, b);
#endif
}

NLIB_M(i128) I128::Add16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s16, a, b);
#endif
}

NLIB_M(i128) I128::Add32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s32, a, b);
#endif
}

NLIB_M(i128) I128::Add64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s64, a, b);
#endif
}

NLIB_M(i128) I128::AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi8(a, b);
#elif defined(NLIB_NEON)
  return vqaddq_s8(a, b);
#endif
}

NLIB_M(i128) I128::AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, s16, a, b);
#endif
}

NLIB_M(i128) I128::AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, u8, a, b);
#endif
}

NLIB_M(i128) I128::AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, u16, a, b);
#endif
}

NLIB_M(i128) I128::Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(a, b);
#elif defined(NLIB_NEON)
  return vsubq_s8(a, b);
#endif
}

NLIB_M(i128) I128::Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s16, a, b);
#endif
}

NLIB_M(i128) I128::Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s32, a, b);
#endif
}

NLIB_M(i128) I128::Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s64, a, b);
#endif
}

NLIB_M(i128) I128::SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, s8, a, b);
#endif
}

NLIB_M(i128) I128::SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, s16, a, b);
#endif
}

NLIB_M(i128) I128::SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, u8, a, b);
#endif
}

NLIB_M(i128) I128::SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, u16, a, b);
#endif
}
NLIB_M(i128) I128::PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ax = _mm_add_epi8(a, _mm_srli_epi16(a, 8));
  __m128i bx = _mm_add_epi8(b, _mm_srli_epi16(b, 8));
  return I128::NarrowFrom16To8(ax, bx);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpaddq_s8(a, b);
#else
  int8x8_t al = vget_low_s8(a);
  int8x8_t ah = vget_high_s8(a);
  int8x8_t rl = vpadd_s8(al, ah);
  int8x8_t bl = vget_low_s8(b);
  int8x8_t bh = vget_high_s8(b);
  int8x8_t rh = vpadd_s8(bl, bh);
  return vcombine_s8(rl, rh);
#endif
#endif
}

NLIB_M(i128) I128::PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi16(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vreinterpretq_s8_s16(vpaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)));
#else
  int16x4_t al = vget_low_s16(vreinterpretq_s16_s8(a));
  int16x4_t ah = vget_high_s16(vreinterpretq_s16_s8(a));
  int16x4_t rl = vpadd_s16(al, ah);
  int16x4_t bl = vget_low_s16(vreinterpretq_s16_s8(b));
  int16x4_t bh = vget_high_s16(vreinterpretq_s16_s8(b));
  int16x4_t rh = vpadd_s16(bl, bh);
  return NLIB_CMB(s16, rl, rh);
#endif
#endif
}

NLIB_M(i128) I128::PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi32(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vreinterpretq_s8_s32(vpaddq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)));
#else
  int32x2_t al = vget_low_s32(vreinterpretq_s32_s8(a));
  int32x2_t ah = vget_high_s32(vreinterpretq_s32_s8(a));
  int32x2_t rl = vpadd_s32(al, ah);
  int32x2_t bl = vget_low_s32(vreinterpretq_s32_s8(b));
  int32x2_t bh = vget_high_s32(vreinterpretq_s32_s8(b));
  int32x2_t rh = vpadd_s32(bl, bh);
  return NLIB_CMB(s32, rl, rh);
#endif
#endif
}
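// Note: PairwiseAddN sums adjacent lane pairs, e.g. PairwiseAdd32(a, b)
// yields {a0 + a1, a2 + a3, b0 + b1, b2 + b3}, which is one step of a
// horizontal reduction.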
NLIB_M(i128) I128::Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmulq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlaq, s16, c, a, b);
#endif
}

NLIB_M(i128) I128::MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlsq, s16, c, a, b);
#endif
}

NLIB_M(i128) I128::Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmulq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlaq, s32, c, a, b);
#endif
}

NLIB_M(i128) I128::MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlsq, s32, c, a, b);
#endif
}
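// Note: MultAddN(a, b, c) computes c + a * b per lane and MultSubN(a, b, c)
// computes c - a * b, matching the accumulator-first order of NEON's
// vmlaq/vmlsq.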
NLIB_M(i128) I128::MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s8, a, b);
#endif
}

NLIB_M(i128) I128::MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u8, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u16, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u32, a, b);
#endif
}

NLIB_M(i128) I128::MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s8, a, b);
#endif
}

NLIB_M(i128) I128::MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u8, a, b);
#endif
}

NLIB_M(i128) I128::MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u16, a, b);
#endif
}

NLIB_M(i128) I128::MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u32, a, b);
#endif
}
NLIB_M(i128) I128::AbsInt8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi8(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s8, value);
#endif
}

NLIB_M(i128) I128::AbsInt16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi16(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s16, value);
#endif
}

NLIB_M(i128) I128::AbsInt32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi32(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s32, value);
#endif
}

NLIB_M(i128) I128::AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi8(_mm_sub_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s8, a, b);
#endif
}

NLIB_M(i128) I128::AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi16(_mm_sub_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s16, a, b);
#endif
}

NLIB_M(i128) I128::AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi32(_mm_sub_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s32, a, b);
#endif
}

NLIB_M(i128) I128::NegateInt8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s8, value);
#endif
}

NLIB_M(i128) I128::NegateInt16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s16, value);
#endif
}

NLIB_M(i128) I128::NegateInt32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s32, value);
#endif
}
NLIB_M(i128) I128::And(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_and_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vandq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Or(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vorrq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Xor(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_xor_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(veorq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Not(i128arg a) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vmvnq, s8, a);
#endif
}

NLIB_M(i128) I128::AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_andnot_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vbicq, s8, b, a);
#endif
}

NLIB_M(i128) I128::OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i not_a = _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
  return _mm_or_si128(not_a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vornq, s8, b, a);
#endif
}

NLIB_M(i128) I128::Test8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  // Reinterpret added: vtstq_s8 yields uint8x16_t, not i128 (int8x16_t).
  return vreinterpretq_s8_u8(vtstq_s8(a, b));
#else
  return I128::Not(I128::CmpEqZero8(I128::And(a, b)));
#endif
}

NLIB_M(i128) I128::Test16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return NLIB_CMP(vtstq, s16, a, b, u16);
#else
  return I128::Not(I128::CmpEqZero16(I128::And(a, b)));
#endif
}

NLIB_M(i128) I128::Test32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return NLIB_CMP(vtstq, s32, a, b, u32);
#else
  return I128::Not(I128::CmpEqZero32(I128::And(a, b)));
#endif
}
NLIB_M(i128) I128::CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi64(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_CMP(vceqq, s64, a, b, u64);
#else
  // ARMv7: compare 32-bit halves, AND the pair, then sign-extend to 64 bits.
  uint32x4_t x0 = vceqq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b));
  uint32x2x2_t x1 = vtrn_u32(vget_low_u32(x0), vget_high_u32(x0));
  uint32x2_t x2 = vand_u32(x1.val[0], x1.val[1]);
  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
  return vreinterpretq_s8_s64(result);
#endif
#endif
}
NLIB_M(i128) I128::CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  return _mm_cmpgt_epi64(b, a);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_CMP(vcltq, s64, a, b, u64);
#else
  // ARMv7: signed compare of the upper words, unsigned compare of the lower.
  int32x2x2_t trn_a = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(a)),
                               vreinterpret_s32_s8(vget_high_s8(a)));
  int32x2x2_t trn_b = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(b)),
                               vreinterpret_s32_s8(vget_high_s8(b)));
  uint32x2_t upper_lt = vclt_s32(trn_a.val[1], trn_b.val[1]);
  uint32x2_t upper_eq = vceq_s32(trn_a.val[1], trn_b.val[1]);
  // Reinterprets added: vclt_u32 requires unsigned operands.
  uint32x2_t lower_lt = vclt_u32(vreinterpret_u32_s32(trn_a.val[0]),
                                 vreinterpret_u32_s32(trn_b.val[0]));
  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
  return vreinterpretq_s8_s64(result);
#endif
#else
  // SSE4.1 without SSE4.2: compose the 64-bit compare from 32-bit compares.
  i128 cmp = I128::CmpLtInt32(a, b);
  i128 eq = I128::CmpEq32(a, b);
  i128 cmp_lt = I128::CmpLtUint32(a, b);
  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp_lt, cmp_lt);
  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
#endif
}

NLIB_M(i128) I128::CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  return _mm_cmpgt_epi64(a, b);
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgtq, s64, a, b, u64);
#else
  return I128::CmpLtInt64(b, a);
#endif
}
NLIB_M(i128) I128::CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  // SSE has no unsigned compare; bias both operands by 0x80 and compare signed.
  i128 ofs = I128::SetValue(0x80, each_uint8);
  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80, each_uint8);
  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x8000U, each_uint16);
  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x8000U, each_uint16);
  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  return _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_CMP(vcltq, u64, a, b, u64);
#else
  uint32x2x2_t trn_a = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(a)),
                                vreinterpret_u32_s8(vget_high_s8(a)));
  uint32x2x2_t trn_b = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(b)),
                                vreinterpret_u32_s8(vget_high_s8(b)));
  uint32x2_t upper_lt = vclt_u32(trn_a.val[1], trn_b.val[1]);
  uint32x2_t upper_eq = vceq_u32(trn_a.val[1], trn_b.val[1]);
  uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
  return vreinterpretq_s8_s64(result);
#endif
#else
  // SSE4.1 without SSE4.2: compose the 64-bit compare from 32-bit compares.
  i128 cmp = I128::CmpLtUint32(a, b);
  i128 eq = I128::CmpEq32(a, b);
  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp, cmp);
  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
#endif
}

NLIB_M(i128) I128::CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  return _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgtq, u64, a, b, u64);
#else
  return I128::CmpLtUint64(b, a);
#endif
}
1937 NLIB_M(i128) I128::CmpLeInt8(i128arg a, i128arg b)
NLIB_NOEXCEPT {
1938 #if defined(NLIB_SSE41) 1939 return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1940 #elif defined(NLIB_NEON) 1941 return NLIB_CMP(vcleq, s8, a, b, u8);
1946 NLIB_M(i128) I128::CmpLeInt16(i128arg a, i128arg b)
NLIB_NOEXCEPT {
1947 #if defined(NLIB_SSE41) 1948 return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1949 #elif defined(NLIB_NEON) 1950 return NLIB_CMP(vcleq, s16, a, b, u16);
1955 NLIB_M(i128) I128::CmpLeInt32(i128arg a, i128arg b)
NLIB_NOEXCEPT {
1956 #if defined(NLIB_SSE41) 1957 return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1958 #elif defined(NLIB_NEON) 1959 return NLIB_CMP(vcleq, s32, a, b, u32);
1964 NLIB_M(i128) I128::CmpLeInt64(i128arg a, i128arg b)
NLIB_NOEXCEPT {
1965 #if defined(NLIB_SSE42) 1966 return _mm_or_si128(_mm_cmpgt_epi64(b, a), _mm_cmpeq_epi64(a, b));
1967 #elif defined(NLIB_NEON) && defined(__aarch64__) 1968 return NLIB_CMP(vcleq, s64, a, b, u64);
1970 return I128::Not(I128::CmpGtInt64(a, b));
1975 NLIB_M(i128) I128::CmpGeInt8(i128arg a, i128arg b)
NLIB_NOEXCEPT {
1976 #if defined(NLIB_SSE41) 1977 return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
1978 #elif defined(NLIB_NEON) 1979 return NLIB_CMP(vcgeq, s8, a, b, u8);
1984 NLIB_M(i128) I128::CmpGeInt16(i128arg a, i128arg b)
NLIB_NOEXCEPT {
1985 #if defined(NLIB_SSE41) 1986 return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
1987 #elif defined(NLIB_NEON) 1988 return NLIB_CMP(vcgeq, s16, a, b, u16);
1993 NLIB_M(i128) I128::CmpGeInt32(i128arg a, i128arg b)
NLIB_NOEXCEPT {
1994 #if defined(NLIB_SSE41) 1995 return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
1996 #elif defined(NLIB_NEON) 1997 return NLIB_CMP(vcgeq, s32, a, b, u32);
2002 NLIB_M(i128) I128::CmpGeInt64(i128arg a, i128arg b)
NLIB_NOEXCEPT {
2003 #if defined(NLIB_SSE42) 2004 return _mm_or_si128(_mm_cmpgt_epi64(a, b), _mm_cmpeq_epi64(a, b));
2005 #elif defined(NLIB_NEON) && defined(__aarch64__) 2006 return NLIB_CMP(vcgeq, s64, a, b, u64);
2008 return I128::Not(I128::CmpLtInt64(a, b));
// CmpLeUint8/16/32: a <= b via min(a, b) == a on SSE; vcleq on NEON.
NLIB_M(i128) I128::CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, u32, a, b, u32);
#endif
}

// CmpLeUint64: SSE4.2 only has a signed 64-bit compare, so both operands are
// biased by 0x8000000000000000 to map unsigned order onto signed order.
NLIB_M(i128) I128::CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcleq, u64, a, b, u64);
#else
  return I128::Not(I128::CmpGtUint64(a, b));
#endif
}

// CmpGeUint8/16/32: a >= b via max(a, b) == a on SSE; vcgeq on NEON.
NLIB_M(i128) I128::CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgeq, u64, a, b, u64);
#else
  return I128::Not(I128::CmpLtUint64(a, b));
#endif
}
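
// A scalar restatement of the identity the SSE4.2 paths above rely on
// (illustrative addition): adding 2^63 flips the sign bit, so unsigned order
// becomes signed order.  Assumes two's-complement int64_t, as SSE does.
inline bool LeViaSignedBias(uint64_t x, uint64_t y) {
  const uint64_t kBias = 0x8000000000000000ULL;
  return static_cast<int64_t>(x + kBias) <= static_cast<int64_t>(y + kBias);
}
// e.g. LeViaSignedBias(0, ~0ULL) is true, LeViaSignedBias(~0ULL, 0) is false.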
// CmpEqZero8/16/32/64: lane-wise compare against zero.  AArch64 has dedicated
// vceqzq instructions; other targets fall back to CmpEq against SetZero().
NLIB_M(i128) I128::CmpEqZero8(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vceqzq_s8(value);
#else
  return I128::CmpEq8(value, I128::SetZero());
#endif
}

NLIB_M(i128) I128::CmpEqZero16(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vreinterpretq_s8_s16(vceqzq_s16(vreinterpretq_s16_s8(value)));
#else
  return I128::CmpEq16(value, I128::SetZero());
#endif
}

NLIB_M(i128) I128::CmpEqZero32(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vreinterpretq_s8_s32(vceqzq_s32(vreinterpretq_s32_s8(value)));
#else
  return I128::CmpEq32(value, I128::SetZero());
#endif
}

NLIB_M(i128) I128::CmpEqZero64(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vreinterpretq_s8_s64(vceqzq_s64(vreinterpretq_s64_s8(value)));
#else
  return I128::CmpEq64(value, I128::SetZero());
#endif
}
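
// Usage sketch (illustrative addition): strlen-style scan for the first zero
// byte in a 16-byte block, combining CmpEqZero8 with MoveMask8 (defined later
// in this listing) and the nlib_ctz32 helper.
inline int FirstZeroByteIndex(const void* p /* 16-byte aligned */) {
  i128 v = I128::LoadA16(p);
  int mask = I128::MoveMask8(I128::CmpEqZero8(v));  // bit i set <=> byte i == 0
  return mask != 0 ? nlib_ctz32(static_cast<uint32_t>(mask)) : -1;
}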
// ShiftLeftLogical8: SSE has no 8-bit shifts, so each half is widened to
// 16-bit lanes, shifted, and narrowed back.  NEON shifts via vshlq, which
// takes a per-lane count (negative counts shift right).
NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i xh = _mm_slli_epi16(_mm_cvtepu8_epi16(hi), count);
  __m128i xl = _mm_slli_epi16(_mm_cvtepu8_epi16(value), count);
  return I128::NarrowFrom16To8(xl, xh);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u8, value, count, s8);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i xh = _mm_srli_epi16(_mm_cvtepu8_epi16(hi), count);
  __m128i xl = _mm_srli_epi16(_mm_cvtepu8_epi16(value), count);
  return _mm_packus_epi16(xl, xh);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u8, value, -count, s8);
#endif
}

NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i xh = _mm_srai_epi16(_mm_cvtepi8_epi16(hi), count);
  __m128i xl = _mm_srai_epi16(_mm_cvtepi8_epi16(value), count);
  // Signed pack: the shifted results are already within int8 range, so
  // _mm_packs_epi16 is exact (an unsigned pack would clamp negatives to 0).
  return _mm_packs_epi16(xl, xh);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s8, value, -count, s8);
#endif
}

// 16/32/64-bit variable-count shifts map directly onto the SSE shift
// intrinsics; NEON again uses vshlq with a negated count for right shifts.
NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u16, value, count, s16);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u16, value, -count, s16);
#endif
}

NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srai_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s16, value, -count, s16);
#endif
}

NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u32, value, count, s32);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u32, value, -count, s32);
#endif
}

NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srai_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s32, value, -count, s32);
#endif
}

NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi64(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u64, value, count, s64);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi64(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u64, value, -count, s64);
#endif
}
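
// Usage sketch (illustrative addition): arithmetic right shifts replicate the
// sign bit, logical ones shift in zeros.
inline void RightShiftExamples() {
  i128 v = I128::SetValue(static_cast<int8_t>(0xF0), each_int8);  // -16 per lane
  i128 a = I128::ShiftRightArithmetic8(v, 4);  // sign fills: every lane 0xFF (-1)
  i128 l = I128::ShiftRightLogical8(v, 4);     // zero fills: every lane 0x0F (15)
  (void)a; (void)l;
}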
// Compile-time shift counts: NEON encodes the immediate in vshlq_n_*/vshrq_n_*;
// other targets defer to the run-time-count overloads above.
template<int N>
NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return vshlq_n_s8(value, N);
#else
  return I128::ShiftLeftLogical8(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint8x16_t tmp = vreinterpretq_u8_s8(value);
  return vreinterpretq_s8_u8(vshrq_n_u8(tmp, N));
#else
  return I128::ShiftRightLogical8(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return vshrq_n_s8(value, N);
#else
  return I128::ShiftRightArithmetic8(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint16x8_t tmp = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vshlq_n_u16(tmp, N));
#else
  return I128::ShiftLeftLogical16(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint16x8_t tmp = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vshrq_n_u16(tmp, N));
#else
  return I128::ShiftRightLogical16(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  int16x8_t tmp = vreinterpretq_s16_s8(value);
  return vreinterpretq_s8_s16(vshrq_n_s16(tmp, N));
#else
  return I128::ShiftRightArithmetic16(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint32x4_t tmp = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vshlq_n_u32(tmp, N));
#else
  return I128::ShiftLeftLogical32(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint32x4_t tmp = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vshrq_n_u32(tmp, N));
#else
  return I128::ShiftRightLogical32(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  int32x4_t tmp = vreinterpretq_s32_s8(value);
  return vreinterpretq_s8_s32(vshrq_n_s32(tmp, N));
#else
  return I128::ShiftRightArithmetic32(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint64x2_t tmp = vreinterpretq_u64_s8(value);
  return vreinterpretq_s8_u64(vshlq_n_u64(tmp, N));
#else
  return I128::ShiftLeftLogical64(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint64x2_t tmp = vreinterpretq_u64_s8(value);
  return vreinterpretq_s8_u64(vshrq_n_u64(tmp, N));
#else
  return I128::ShiftRightLogical64(value, N);
#endif
}

// Boundary specializations: vshlq_n_*/vshrq_n_* cannot encode a full-width
// left shift or a zero-count right shift, so those cases are handled here
// (a full-width left shift yields zero; a zero-count shift is the identity).
template<>
NLIB_M(i128) I128::ShiftLeftLogical8<8>(i128arg value) NLIB_NOEXCEPT {
  (void)value;
  return I128::SetZero();
}
template<>
NLIB_M(i128) I128::ShiftRightLogical8<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}
template<>
NLIB_M(i128) I128::ShiftLeftLogical16<16>(i128arg value) NLIB_NOEXCEPT {
  (void)value;
  return I128::SetZero();
}
template<>
NLIB_M(i128) I128::ShiftRightLogical16<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}
template<>
NLIB_M(i128) I128::ShiftRightArithmetic16<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}
template<>
NLIB_M(i128) I128::ShiftLeftLogical32<32>(i128arg value) NLIB_NOEXCEPT {
  (void)value;
  return I128::SetZero();
}
template<>
NLIB_M(i128) I128::ShiftRightLogical32<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}
template<>
NLIB_M(i128) I128::ShiftRightArithmetic32<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}
template<>
NLIB_M(i128) I128::ShiftLeftLogical64<64>(i128arg value) NLIB_NOEXCEPT {
  (void)value;
  return I128::SetZero();
}
template<>
NLIB_M(i128) I128::ShiftRightLogical64<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}
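
// Usage sketch (illustrative addition): with a compile-time count NEON can
// fold the shift into an immediate, and the specializations keep template
// metaprogramming over all counts well-formed.
inline void ShiftImmediateExamples(i128 v) {
  i128 a = I128::ShiftLeftLogical16<4>(v);   // immediate-encoded on NEON
  i128 b = I128::ShiftLeftLogical16<16>(v);  // specialization: all lanes zero
  i128 c = I128::ShiftRightLogical16<0>(v);  // specialization: identity
  (void)a; (void)b; (void)c;
}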
// Whole-register byte shifts and rotates; N is a compile-time byte count.
template<int N>
NLIB_M(i128) I128::ByteShiftLeft(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_si128(value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(vdupq_n_s8(0), value, 16 - N);
#endif
}

template<int N>
NLIB_M(i128) I128::ByteShiftRight(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_si128(value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(value, vdupq_n_s8(0), N);
#endif
}

template<int N>
NLIB_M(i128) I128::ByteRotateRight(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_alignr_epi8(value, value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(value, value, N);
#endif
}

// AlignR: concatenates a:b and extracts the 16 bytes starting at byte N of b,
// i.e. the SSSE3 palignr operation.
template<int N>
NLIB_M(i128) I128::AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_alignr_epi8(a, b, N);
#elif defined(NLIB_NEON)
  return vextq_s8(b, a, N);
#endif
}
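
// Usage sketch (illustrative addition): AlignR turns two aligned loads into an
// arbitrary 16-byte window, the usual trick in string and convolution kernels.
inline i128 LoadBytes3To18(const uint8_t* p /* 16-byte aligned */) {
  i128 lo = I128::LoadA16(p);       // bytes 0..15
  i128 hi = I128::LoadA16(p + 16);  // bytes 16..31
  return I128::AlignR<3>(hi, lo);   // bytes 3..18, no unaligned load
}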
// NarrowFrom16To8: truncates each 16-bit lane to its low byte and packs both
// inputs into one vector (lo lanes first).  The SSE path masks to 0x00FF so
// that _mm_packus_epi16 cannot saturate.
NLIB_M(i128) I128::NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0x00FFU, each_uint16);
  __m128i lo_mask = _mm_and_si128(lo, mask);
  __m128i hi_mask = _mm_and_si128(hi, mask);
  return _mm_packus_epi16(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
  return vreinterpretq_s8_u8(vmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
#else
  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
  uint8x8_t h = vmovn_u16(vreinterpretq_u16_s8(hi));
  return NLIB_CMB(u8, l, h);
#endif
#endif
}

NLIB_M(i128) I128::NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
  __m128i lo_mask = _mm_and_si128(lo, mask);
  __m128i hi_mask = _mm_and_si128(hi, mask);
  return _mm_packus_epi32(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
  return vreinterpretq_s8_u16(vmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
#else
  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
  uint16x4_t h = vmovn_u32(vreinterpretq_u32_s8(hi));
  return NLIB_CMB(u16, l, h);
#endif
#endif
}

NLIB_M(i128) I128::NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i lo_ = _mm_shuffle_epi32(lo, _MM_SHUFFLE(3, 1, 2, 0));
  __m128i hi_ = _mm_shuffle_epi32(hi, _MM_SHUFFLE(3, 1, 2, 0));
  return _mm_unpacklo_epi64(lo_, hi_);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
  return vreinterpretq_s8_u32(vmovn_high_u64(l, vreinterpretq_u64_s8(hi)));
#else
  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
  uint32x2_t h = vmovn_u64(vreinterpretq_u64_s8(hi));
  return NLIB_CMB(u32, l, h);
#endif
#endif
}
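
// Usage sketch (illustrative addition): truncating narrow keeps only the low
// byte of each lane; contrast with the saturating converts below.
inline i128 TruncateNarrowExample() {
  i128 v = I128::SetValue(0x1234U, each_uint16);
  return I128::NarrowFrom16To8(v, v);  // all 16 byte lanes hold 0x34
}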
// Saturating narrowing conversions.  _mm_packus_epi16/_mm_packus_epi32 treat
// their inputs as signed, so the unsigned variants clamp with an unsigned min
// first (an AND with 0x7FFF would wrap values >= 0x8000 instead of saturating).
NLIB_M(i128) I128::ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 b7FFF = I128::SetValue(0x7FFFU, each_uint16);
  __m128i lotmp = _mm_min_epu16(lo, b7FFF);
  __m128i hitmp = _mm_min_epu16(hi, b7FFF);
  return _mm_packus_epi16(lotmp, hitmp);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
  return vreinterpretq_s8_u8(vqmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
#else
  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
  uint8x8_t h = vqmovn_u16(vreinterpretq_u16_s8(hi));
  return NLIB_CMB(u8, l, h);
#endif
#endif
}

NLIB_M(i128) I128::ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_packs_epi16(lo, hi);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
  return vqmovn_high_s16(l, vreinterpretq_s16_s8(hi));
#else
  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
  int8x8_t h = vqmovn_s16(vreinterpretq_s16_s8(hi));
  return NLIB_CMB(s8, l, h);
#endif
#endif
}

NLIB_M(i128) I128::ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 b7FFFFFFF = I128::SetValue(0x7FFFFFFFU, each_uint32);
  __m128i lotmp = _mm_min_epu32(lo, b7FFFFFFF);
  __m128i hitmp = _mm_min_epu32(hi, b7FFFFFFF);
  return _mm_packus_epi32(lotmp, hitmp);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
  return vreinterpretq_s8_u16(vqmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
#else
  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
  uint16x4_t h = vqmovn_u32(vreinterpretq_u32_s8(hi));
  return NLIB_CMB(u16, l, h);
#endif
#endif
}

NLIB_M(i128) I128::ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_packs_epi32(lo, hi);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
  return vreinterpretq_s8_s16(vqmovn_high_s32(l, vreinterpretq_s32_s8(hi)));
#else
  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
  int16x4_t h = vqmovn_s32(vreinterpretq_s32_s8(hi));
  return NLIB_CMB(s16, l, h);
#endif
#endif
}
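
// Usage sketch (illustrative addition): saturation clamps where truncation
// wraps.  0x0123 (291) exceeds the uint8 range.
inline void SaturateVsTruncate(i128* sat, i128* trunc) {
  i128 v = I128::SetValue(0x0123U, each_uint16);
  *sat = I128::ConvertFromUint16ToUint8Saturated(v, v);  // lanes = 0xFF (255)
  *trunc = I128::NarrowFrom16To8(v, v);                  // lanes = 0x23 (35)
}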
// Widening conversions: the *Lo form widens the low half of the lanes, *Hi the
// high half.  SSE first rotates the high half down with _mm_shuffle_epi32;
// AArch64 uses the dedicated vmovl_high_* forms.
NLIB_M(i128) I128::ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi8_epi16(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s16(vmovl_s8(vget_low_s8(value)));
#endif
}

NLIB_M(i128) I128::ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  int16x8_t result = vmovl_high_s8(value);
#else
  int16x8_t result = vmovl_s8(vget_high_s8(value));
#endif
  return vreinterpretq_s8_s16(result);
#endif
}

NLIB_M(i128) I128::ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi16_epi32(value);
#elif defined(NLIB_NEON)
  int16x8_t x = vreinterpretq_s16_s8(value);
  int32x4_t result = vmovl_s16(vget_low_s16(x));
  return vreinterpretq_s8_s32(result);
#endif
}

NLIB_M(i128) I128::ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int16x8_t x = vreinterpretq_s16_s8(value);
#ifdef __aarch64__
  int32x4_t result = vmovl_high_s16(x);
#else
  int32x4_t result = vmovl_s16(vget_high_s16(x));
#endif
  return vreinterpretq_s8_s32(result);
#endif
}

NLIB_M(i128) I128::ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_epi64(value);
#elif defined(NLIB_NEON)
  int32x4_t x = vreinterpretq_s32_s8(value);
  int64x2_t result = vmovl_s32(vget_low_s32(x));
  return vreinterpretq_s8_s64(result);
#endif
}

NLIB_M(i128) I128::ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int32x4_t x = vreinterpretq_s32_s8(value);
#ifdef __aarch64__
  int64x2_t result = vmovl_high_s32(x);
#else
  int64x2_t result = vmovl_s32(vget_high_s32(x));
#endif
  return vreinterpretq_s8_s64(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu8_epi16(value);
#elif defined(NLIB_NEON)
  uint8x16_t x = vreinterpretq_u8_s8(value);
  uint16x8_t result = vmovl_u8(vget_low_u8(x));
  return vreinterpretq_s8_u16(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint8x16_t x = vreinterpretq_u8_s8(value);
#ifdef __aarch64__
  uint16x8_t result = vmovl_high_u8(x);
#else
  uint16x8_t result = vmovl_u8(vget_high_u8(x));
#endif
  return vreinterpretq_s8_u16(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu16_epi32(value);
#elif defined(NLIB_NEON)
  uint16x8_t x = vreinterpretq_u16_s8(value);
  uint32x4_t result = vmovl_u16(vget_low_u16(x));
  return vreinterpretq_s8_u32(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint16x8_t x = vreinterpretq_u16_s8(value);
#ifdef __aarch64__
  uint32x4_t result = vmovl_high_u16(x);
#else
  uint32x4_t result = vmovl_u16(vget_high_u16(x));
#endif
  return vreinterpretq_s8_u32(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu32_epi64(value);
#elif defined(NLIB_NEON)
  uint32x4_t x = vreinterpretq_u32_s8(value);
  uint64x2_t result = vmovl_u32(vget_low_u32(x));
  return vreinterpretq_s8_u64(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint32x4_t x = vreinterpretq_u32_s8(value);
#ifdef __aarch64__
  uint64x2_t result = vmovl_high_u32(x);
#else
  uint64x2_t result = vmovl_u32(vget_high_u32(x));
#endif
  return vreinterpretq_s8_u64(result);
#endif
}
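
// Usage sketch (illustrative addition): widen 16 bytes to two uint16 vectors;
// NarrowFrom16To8 on the pair reproduces the original bytes.
inline void WidenBytes(const void* src, i128* lo16, i128* hi16) {
  i128 bytes = I128::LoadA16(src);
  *lo16 = I128::ConvertFromUint8ToUint16Lo(bytes);  // lanes 0..7
  *hi16 = I128::ConvertFromUint8ToUint16Hi(bytes);  // lanes 8..15
}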
// Zip*Lo/Hi interleave the lower/upper halves of a and b; Unzip*Lo/Hi gather
// the even/odd lanes.  AArch64 has single-instruction vzip1q/vzip2q and
// vuzp1q/vuzp2q; 32-bit NEON takes one half of the paired vzipq/vuzpq result.
NLIB_M(i128) I128::Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi8(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vzip1q_s8(a, b);
#else
  return vzipq_s8(a, b).val[0];
#endif
#endif
}

NLIB_M(i128) I128::Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpackhi_epi8(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vzip2q_s8(a, b);
#else
  return vzipq_s8(a, b).val[1];
#endif
#endif
}

NLIB_M(i128) I128::Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0x00FFU, each_uint16);
  __m128i lo_mask = _mm_and_si128(a, mask);
  __m128i hi_mask = _mm_and_si128(b, mask);
  return _mm_packus_epi16(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vuzp1q_s8(a, b);
#else
  return vuzpq_s8(a, b).val[0];
#endif
#endif
}

NLIB_M(i128) I128::Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFF00U, each_uint16);
  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 1);
  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 1);
  return _mm_packus_epi16(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vuzp2q_s8(a, b);
#else
  return vuzpq_s8(a, b).val[1];
#endif
#endif
}

NLIB_M(i128) I128::Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi16(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vzip1q, u16, a, b);
#else
  return vreinterpretq_s8_u16(vzipq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
#endif
#endif
}

NLIB_M(i128) I128::Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpackhi_epi16(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vzip2q, u16, a, b);
#else
  return vreinterpretq_s8_u16(vzipq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
#endif
#endif
}

NLIB_M(i128) I128::Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
  __m128i lo_mask = _mm_and_si128(a, mask);
  __m128i hi_mask = _mm_and_si128(b, mask);
  return _mm_packus_epi32(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vuzp1q, u16, a, b);
#else
  return vreinterpretq_s8_u16(vuzpq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
#endif
#endif
}

NLIB_M(i128) I128::Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFFFF0000U, each_uint32);
  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 2);
  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 2);
  return _mm_packus_epi32(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vuzp2q, u16, a, b);
#else
  return vreinterpretq_s8_u16(vuzpq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
#endif
#endif
}

NLIB_M(i128) I128::Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi32(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vzip1q, u32, a, b);
#else
  return vreinterpretq_s8_u32(vzipq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
#endif
#endif
}

NLIB_M(i128) I128::Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpackhi_epi32(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vzip2q, u32, a, b);
#else
  return vreinterpretq_s8_u32(vzipq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
#endif
#endif
}

NLIB_M(i128) I128::Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
  return _mm_blend_epi16(x0, x1, 0xF0);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vuzp1q, u32, a, b);
#else
  return vreinterpretq_s8_u32(vuzpq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
#endif
#endif
}

NLIB_M(i128) I128::Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 3, 1));
  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0));
  return _mm_blend_epi16(x0, x1, 0xF0);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vuzp2q, u32, a, b);
#else
  return vreinterpretq_s8_u32(vuzpq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
#endif
#endif
}
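
// Usage sketch (illustrative addition): interleave and deinterleave two
// channels of int16 samples (e.g. stereo audio).  Zip yields L0 R0 L1 R1 ...;
// Unzip inverts it.
inline void InterleaveStereo16(const void* lsrc, const void* rsrc, void* dst) {
  i128 left = I128::LoadA16(lsrc);   // L0..L7
  i128 right = I128::LoadA16(rsrc);  // R0..R7
  I128::StoreA16(dst, I128::Zip16Lo(left, right));  // L0 R0 .. L3 R3
  I128::StoreA16(static_cast<char*>(dst) + 16, I128::Zip16Hi(left, right));
}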
// Permute8: compile-time permutation over the 32 bytes of a:b (indices 0-15
// pick from a, 16-31 from b).  With __builtin_shufflevector the compiler emits
// the permutation directly; the fallback ORs two Shuffle8 lookups, using -128
// (top bit set) to zero lanes the respective source does not supply.
template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
         int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
NLIB_M(i128) I128::Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return __builtin_shufflevector(a, b,
                                 V0, V1, V2, V3, V4, V5, V6, V7,
                                 V8, V9, V10, V11, V12, V13, V14, V15);
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v16qi)a, (__v16qi)b,
                                 V0, V1, V2, V3, V4, V5, V6, V7,
                                 V8, V9, V10, V11, V12, V13, V14, V15);
#else
  NLIB_ALIGNAS(16) const int8_t mask_a[16] = {
    (V0 < 0 || V0 > 15) ? -128 : V0,
    (V1 < 0 || V1 > 15) ? -128 : V1,
    (V2 < 0 || V2 > 15) ? -128 : V2,
    (V3 < 0 || V3 > 15) ? -128 : V3,
    (V4 < 0 || V4 > 15) ? -128 : V4,
    (V5 < 0 || V5 > 15) ? -128 : V5,
    (V6 < 0 || V6 > 15) ? -128 : V6,
    (V7 < 0 || V7 > 15) ? -128 : V7,
    (V8 < 0 || V8 > 15) ? -128 : V8,
    (V9 < 0 || V9 > 15) ? -128 : V9,
    (V10 < 0 || V10 > 15) ? -128 : V10,
    (V11 < 0 || V11 > 15) ? -128 : V11,
    (V12 < 0 || V12 > 15) ? -128 : V12,
    (V13 < 0 || V13 > 15) ? -128 : V13,
    (V14 < 0 || V14 > 15) ? -128 : V14,
    (V15 < 0 || V15 > 15) ? -128 : V15
  };
  NLIB_ALIGNAS(16) const int8_t mask_b[16] = {
    V0 < 16 ? -128 : (V0 - 16),
    V1 < 16 ? -128 : (V1 - 16),
    V2 < 16 ? -128 : (V2 - 16),
    V3 < 16 ? -128 : (V3 - 16),
    V4 < 16 ? -128 : (V4 - 16),
    V5 < 16 ? -128 : (V5 - 16),
    V6 < 16 ? -128 : (V6 - 16),
    V7 < 16 ? -128 : (V7 - 16),
    V8 < 16 ? -128 : (V8 - 16),
    V9 < 16 ? -128 : (V9 - 16),
    V10 < 16 ? -128 : (V10 - 16),
    V11 < 16 ? -128 : (V11 - 16),
    V12 < 16 ? -128 : (V12 - 16),
    V13 < 16 ? -128 : (V13 - 16),
    V14 < 16 ? -128 : (V14 - 16),
    V15 < 16 ? -128 : (V15 - 16)
  };
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
#endif
}
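
// Usage sketch (illustrative addition): Permute8 subsumes byte reverses,
// rotates, and cross-vector moves; indices must select from the 32 input bytes.
inline i128 ReverseBytes(i128 v) {
  return I128::Permute8<15, 14, 13, 12, 11, 10, 9, 8,
                        7, 6, 5, 4, 3, 2, 1, 0>(v, v);
}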
// Permute16: the same idea over the sixteen 16-bit lanes of a:b (indices 0-7
// from a, 8-15 from b); the fallback expands each lane index into a pair of
// byte indices for Shuffle8.
template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
NLIB_M(i128) I128::Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return vreinterpretq_s8_u16(__builtin_shufflevector(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b),
      V0, V1, V2, V3, V4, V5, V6, V7));
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v8hi)a, (__v8hi)b,
                                 V0, V1, V2, V3, V4, V5, V6, V7);
#else
  NLIB_ALIGNAS(16) const int8_t mask_a[16] = {
    (V0 < 0 || V0 > 7) ? -128 : V0 * 2,
    (V0 < 0 || V0 > 7) ? -128 : V0 * 2 + 1,
    (V1 < 0 || V1 > 7) ? -128 : V1 * 2,
    (V1 < 0 || V1 > 7) ? -128 : V1 * 2 + 1,
    (V2 < 0 || V2 > 7) ? -128 : V2 * 2,
    (V2 < 0 || V2 > 7) ? -128 : V2 * 2 + 1,
    (V3 < 0 || V3 > 7) ? -128 : V3 * 2,
    (V3 < 0 || V3 > 7) ? -128 : V3 * 2 + 1,
    (V4 < 0 || V4 > 7) ? -128 : V4 * 2,
    (V4 < 0 || V4 > 7) ? -128 : V4 * 2 + 1,
    (V5 < 0 || V5 > 7) ? -128 : V5 * 2,
    (V5 < 0 || V5 > 7) ? -128 : V5 * 2 + 1,
    (V6 < 0 || V6 > 7) ? -128 : V6 * 2,
    (V6 < 0 || V6 > 7) ? -128 : V6 * 2 + 1,
    (V7 < 0 || V7 > 7) ? -128 : V7 * 2,
    (V7 < 0 || V7 > 7) ? -128 : V7 * 2 + 1
  };
  NLIB_ALIGNAS(16) const int8_t mask_b[16] = {
    V0 < 8 ? -128 : (V0 - 8) * 2,
    V0 < 8 ? -128 : (V0 - 8) * 2 + 1,
    V1 < 8 ? -128 : (V1 - 8) * 2,
    V1 < 8 ? -128 : (V1 - 8) * 2 + 1,
    V2 < 8 ? -128 : (V2 - 8) * 2,
    V2 < 8 ? -128 : (V2 - 8) * 2 + 1,
    V3 < 8 ? -128 : (V3 - 8) * 2,
    V3 < 8 ? -128 : (V3 - 8) * 2 + 1,
    V4 < 8 ? -128 : (V4 - 8) * 2,
    V4 < 8 ? -128 : (V4 - 8) * 2 + 1,
    V5 < 8 ? -128 : (V5 - 8) * 2,
    V5 < 8 ? -128 : (V5 - 8) * 2 + 1,
    V6 < 8 ? -128 : (V6 - 8) * 2,
    V6 < 8 ? -128 : (V6 - 8) * 2 + 1,
    V7 < 8 ? -128 : (V7 - 8) * 2,
    V7 < 8 ? -128 : (V7 - 8) * 2 + 1
  };
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
#endif
}
// Permute32: the 32-bit-lane variant (indices 0-3 from a, 4-7 from b); the
// fallback expands each lane index into four byte indices.
template<int V0, int V1, int V2, int V3>
NLIB_M(i128) I128::Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return vreinterpretq_s8_u32(__builtin_shufflevector(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b),
      V0, V1, V2, V3));
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v4si)a, (__v4si)b,
                                 V0, V1, V2, V3);
#else
  NLIB_ALIGNAS(16) const int8_t mask_a[16] = {
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4,
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 1,
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 2,
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 3,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 1,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 2,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 3,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 1,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 2,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 3,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 1,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 2,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 3
  };
  NLIB_ALIGNAS(16) const int8_t mask_b[16] = {
    V0 < 4 ? -128 : (V0 - 4) * 4,
    V0 < 4 ? -128 : (V0 - 4) * 4 + 1,
    V0 < 4 ? -128 : (V0 - 4) * 4 + 2,
    V0 < 4 ? -128 : (V0 - 4) * 4 + 3,
    V1 < 4 ? -128 : (V1 - 4) * 4,
    V1 < 4 ? -128 : (V1 - 4) * 4 + 1,
    V1 < 4 ? -128 : (V1 - 4) * 4 + 2,
    V1 < 4 ? -128 : (V1 - 4) * 4 + 3,
    V2 < 4 ? -128 : (V2 - 4) * 4,
    V2 < 4 ? -128 : (V2 - 4) * 4 + 1,
    V2 < 4 ? -128 : (V2 - 4) * 4 + 2,
    V2 < 4 ? -128 : (V2 - 4) * 4 + 3,
    V3 < 4 ? -128 : (V3 - 4) * 4,
    V3 < 4 ? -128 : (V3 - 4) * 4 + 1,
    V3 < 4 ? -128 : (V3 - 4) * 4 + 2,
    V3 < 4 ? -128 : (V3 - 4) * 4 + 3
  };
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
#endif
}
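
// Usage sketch (illustrative addition): blend the low two 32-bit lanes of a
// with the high two of b.
inline i128 LowOfAHighOfB(i128 a, i128 b) {
  return I128::Permute32<0, 1, 6, 7>(a, b);  // a0, a1, b2, b3
}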
// Reverses the bytes within each 16-bit lane (vrev16q on NEON; the enclosing
// declaration is not shown in this extract).
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) const uint8_t mask_[16] = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev16q, u8, value);
#endif

// Reverses the bytes within each 32-bit lane (vrev32q on NEON).
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) const uint8_t mask_[16] = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev32q, u8, value);
#endif

// Reverses the bytes within each 64-bit lane (vrev64q on NEON).
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) const uint8_t mask_[16] = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev64q, u8, value);
#endif
// MoveMask8: packs the top bit of each 8-bit lane into a 16-bit integer mask
// (name as used by the helpers at the end of this listing; the NLIB_M(int)
// return spelling is inferred).
NLIB_M(int) I128::MoveMask8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_movemask_epi8(value);
#elif defined(NLIB_NEON)
  uint8x16_t powers = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL));
  uint8x16_t a = vandq_u8(value, powers);
#ifdef __aarch64__
  return vaddv_u8(vget_low_u8(a)) | (vaddv_u8(vget_high_u8(a)) << 8);
#else
  uint8x8_t al = vget_low_u8(a);
  uint8x8_t ah = vget_high_u8(a);
  uint8x8_t tmp = vpadd_u8(al, ah);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u16(vreinterpret_u16_u8(tmp), 0);
#endif
#endif
}

// Packs the top bit of each 16-bit lane into an 8-bit mask (name inferred from
// the MoveMask8 pattern).
NLIB_M(int) I128::MoveMask16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_packs_epi16(value, value);
  return _mm_movemask_epi8(tmp) & 255;
#elif defined(NLIB_NEON)
  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
  uint16x8_t a = vandq_u16(vreinterpretq_u16_s8(value), powers);
#ifdef __aarch64__
  return vaddvq_u16(a);
#else
  uint8x8_t tmp = vmovn_u16(a);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u8(tmp, 0);
#endif
#endif
}

// Packs the top bit of each 32-bit lane into a 4-bit mask (name inferred from
// the MoveMask8 pattern).
NLIB_M(int) I128::MoveMask32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_packs_epi16(value, value);
  tmp = _mm_packs_epi16(tmp, tmp);
  return _mm_movemask_epi8(tmp) & 15;
#elif defined(NLIB_NEON)
  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
  uint32x4_t a = vandq_u32(vreinterpretq_u32_s8(value), powers);
#ifdef __aarch64__
  return vaddvq_u32(a);
#else
  uint16x4_t tmp = vmovn_u32(a);
  tmp = vpadd_u16(tmp, tmp);
  tmp = vpadd_u16(tmp, tmp);
  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
#endif
#endif
}
// Expands bit i of a 16-bit mask into 8-bit lane i (all-ones when set), the
// inverse of MoveMask8.  The enclosing declaration is not shown in this extract.
#if defined(NLIB_NEON)
  int8x8_t m = vcreate_s8(0x8040201008040201ULL);
  int8x8_t s0 = vdup_n_s8(mask & 0xFF);
  int8x8_t s1 = vdup_n_s8(mask >> 8);
  return vtstq_s8(vcombine_s8(m, m), vcombine_s8(s0, s1));
#elif defined(NLIB_SSE41)
  i128 m = I128::SetValue(0x8040201008040201ULL, each_uint64);
  i128 s0 = I128::SetValue(mask & 0xFF, each_int8);
  i128 s1 = I128::SetValue(static_cast<int8_t>(mask >> 8), each_int8);
  i128 s = _mm_blend_epi16(s0, s1, 0xF0);
  return I128::Test8(m, s);
#endif

// The 16-bit-lane analogue: expands bit i of an 8-bit mask into 16-bit lane i.
#if defined(NLIB_NEON)
  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
  uint16x8_t s = vdupq_n_u16(mask);
  return vreinterpretq_s8_u16(vtstq_u16(powers, s));
#elif defined(NLIB_SSE41)
  i128 m0 = I128::SetValue(0x0008000400020001ULL, each_uint64);
  i128 m1 = I128::SetValue(0x0080004000200010ULL, each_uint64);
  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
  i128 s = I128::SetValue(static_cast<int16_t>(mask), each_int16);
  return I128::Test16(m, s);
#endif

// The 32-bit-lane analogue: expands bit i of a 4-bit mask into 32-bit lane i.
#if defined(NLIB_NEON)
  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
  uint32x4_t s = vdupq_n_u32(mask);
  return vreinterpretq_s8_u32(vtstq_u32(powers, s));
#elif defined(NLIB_SSE41)
  i128 m0 = I128::SetValue(0x0000000200000001ULL, each_uint64);
  i128 m1 = I128::SetValue(0x0000000800000004ULL, each_uint64);
  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
  i128 s = I128::SetValue(mask, each_int32);
  return I128::Test32(m, s);
#endif
// True when all 128 bits of value are zero (the enclosing declaration is not
// shown in this extract).
#if defined(NLIB_SSE41)
  return _mm_testz_si128(value, value) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(value));
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;  // all four lanes were zero
#else
  int8x8_t tmp = vorr_s8(vget_low_s8(value), vget_high_s8(value));
  return vget_lane_u64(vreinterpret_u64_s8(tmp), 0) == 0;
#endif
#endif

// True when all 128 bits of value are one.
#if defined(NLIB_SSE41)
  return _mm_testc_si128(value, _mm_cmpeq_epi8(value, value)) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(vmvnq_s8(value)));
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
#else
  int8x8_t tmp = vand_s8(vget_low_s8(value), vget_high_s8(value));
  return vget_lane_s64(vreinterpret_s64_s8(tmp), 0) == -1;
#endif
#endif
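
// Usage sketch (illustrative addition): whole-block early-out tests, here via
// MoveMask8 rather than the all-zero helper above.  SetValue(..., each_uint8)
// is assumed from the tag overloads declared earlier in the header.
inline bool BlockContainsByte(const void* p /* 16-byte aligned */, uint8_t key) {
  i128 v = I128::LoadA16(p);
  i128 eq = I128::CmpEq8(v, I128::SetValue(key, each_uint8));
  return I128::MoveMask8(eq) != 0;  // any lane matched?
}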
// Select: returns a where the mask lane is set and b elsewhere.  Mask lanes
// should be all-ones or all-zero (SSE blendv keys on each byte's top bit;
// NEON vbslq selects bitwise).
NLIB_M(i128) I128::Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_blendv_epi8(b, a, mask);
#elif defined(NLIB_NEON)
  return NLIB_OP3(vbslq, u32, mask, a, b);
#endif
}
// Shuffle8: per-byte table lookup, result[i] = value[shuffle[i]].  On 32-bit
// NEON the 16-byte table is split across the two vtbl2_s8 lookups.
NLIB_M(i128) I128::Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(value, shuffle);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vqtbl1q_s8(value, vreinterpretq_u8_s8(shuffle));
#else
  int8x8x2_t x;
  x.val[0] = vget_low_s8(value);
  x.val[1] = vget_high_s8(value);
  int8x8_t lo = vtbl2_s8(x, vget_low_s8(shuffle));
  int8x8_t hi = vtbl2_s8(x, vget_high_s8(shuffle));
  return vcombine_s8(lo, hi);
#endif
#endif
}
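
// Usage sketch (illustrative addition): Shuffle8 as a 16-entry lookup table,
// here the classic nibble popcount.  I128::And and I128::Add8 are assumed
// names for the header's logical/arithmetic helpers (not shown in this extract).
inline i128 PopCount8(i128 v) {
  NLIB_ALIGNAS(16) static const uint8_t kNibblePopcnt[16] = {
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
  };
  i128 table = I128::LoadA16(kNibblePopcnt);
  i128 mask = I128::SetValue(0x0FU, each_uint8);
  i128 lo = I128::And(v, mask);                              // low nibbles
  i128 hi = I128::And(I128::ShiftRightLogical8(v, 4), mask); // high nibbles
  return I128::Add8(I128::Shuffle8(table, lo), I128::Shuffle8(table, hi));
}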
// Counts the true (all-ones) 8-bit lanes of a mask vector: negating turns each
// all-ones lane into 1, and the lanes are summed (the enclosing declaration is
// not shown in this extract).
#if defined(NLIB_NEON)
#ifdef __aarch64__
  int8x16_t tmp = vnegq_s8(value);
  return vaddvq_s8(tmp);
#else
  int8x16_t tmp = vnegq_s8(value);
  int8x8_t lo = vget_low_s8(tmp);
  int8x8_t hi = vget_high_s8(tmp);
  lo = vadd_s8(lo, hi);
  lo = vpadd_s8(lo, lo);
  lo = vpadd_s8(lo, lo);
  lo = vpadd_s8(lo, lo);
  return vget_lane_s8(lo, 0);
#endif
#else
  return nlib_popcnt16(static_cast<uint16_t>(I128::MoveMask8(value)));
#endif

// Number of leading false 8-bit lanes, counted down from lane 15 (16 when no
// lane is set).
  return nlib_clz32(static_cast<uint32_t>(I128::MoveMask8(value))) - 16;

// Index of the first true 8-bit lane (16 when no lane is set); OR-ing 0x10000
// bounds the trailing-zero count.
  return nlib_ctz32(static_cast<uint32_t>(I128::MoveMask8(value) | 0x10000));
#undef vreinterpretq_s8_s8
#endif  // NLIB_DOXYGEN

// NLIB_I128_TRANSPOSE32: transposes, in place, the 4x4 matrix of 32-bit lanes
// held in four i128 rows.
#if defined(NLIB_SSE41)
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    row0 = _mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 1, 2, 0)); \
    row1 = _mm_shuffle_epi32(row1, _MM_SHUFFLE(3, 1, 2, 0)); \
    row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(3, 1, 2, 0)); \
    row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(3, 1, 2, 0)); \
    __m128i t0_transpose32_ = _mm_unpacklo_epi32(row0, row1); \
    __m128i t1_transpose32_ = _mm_unpackhi_epi32(row0, row1); \
    __m128i t2_transpose32_ = _mm_unpacklo_epi32(row2, row3); \
    __m128i t3_transpose32_ = _mm_unpackhi_epi32(row2, row3); \
    row0 = _mm_unpacklo_epi64(t0_transpose32_, t2_transpose32_); \
    row1 = _mm_unpacklo_epi64(t1_transpose32_, t3_transpose32_); \
    row2 = _mm_unpackhi_epi64(t0_transpose32_, t2_transpose32_); \
    row3 = _mm_unpackhi_epi64(t1_transpose32_, t3_transpose32_); \
  }
#elif defined(NLIB_NEON)
#ifdef __aarch64__
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
                                     vreinterpretq_u32_s8(row1)); \
    uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
                                     vreinterpretq_u32_s8(row3)); \
    uint64x2_t row0_, row1_, row2_, row3_; \
    row0_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
                       vreinterpretq_u64_u32(trn_f1_.val[0])); \
    row0 = vreinterpretq_s8_u64(row0_); \
    row1_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
                       vreinterpretq_u64_u32(trn_f1_.val[1])); \
    row1 = vreinterpretq_s8_u64(row1_); \
    row2_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
                       vreinterpretq_u64_u32(trn_f1_.val[0])); \
    row2 = vreinterpretq_s8_u64(row2_); \
    row3_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
                       vreinterpretq_u64_u32(trn_f1_.val[1])); \
    row3 = vreinterpretq_s8_u64(row3_); \
  }
#else
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
                                     vreinterpretq_u32_s8(row1)); \
    uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
                                     vreinterpretq_u32_s8(row3)); \
    uint32x4_t row0_, row1_, row2_, row3_; \
    uint32x2_t lo, hi; \
    lo = vget_low_u32(trn_f0_.val[0]); hi = vget_low_u32(trn_f1_.val[0]); \
    row0_ = vcombine_u32(lo, hi); \
    row0 = vreinterpretq_s8_u32(row0_); \
    lo = vget_low_u32(trn_f0_.val[1]); hi = vget_low_u32(trn_f1_.val[1]); \
    row1_ = vcombine_u32(lo, hi); \
    row1 = vreinterpretq_s8_u32(row1_); \
    lo = vget_high_u32(trn_f0_.val[0]); hi = vget_high_u32(trn_f1_.val[0]); \
    row2_ = vcombine_u32(lo, hi); \
    row2 = vreinterpretq_s8_u32(row2_); \
    lo = vget_high_u32(trn_f0_.val[1]); hi = vget_high_u32(trn_f1_.val[1]); \
    row3_ = vcombine_u32(lo, hi); \
    row3 = vreinterpretq_s8_u32(row3_); \
  }
#endif
#endif

#endif  // INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

An empty structure used as a tag for 32-bit signed integers.
An empty structure used as a tag for 64-bit signed integers.
constexpr const each_uint8_tag each_uint8
A constant object of type each_uint8_tag; a tag that indicates 8-bit unsigned integers.
An empty structure used as a tag for selecting lanes split into 8-bit units.
An empty structure used as a tag for selecting lanes split into 32-bit units.
An empty structure used as a tag for 8-bit signed integers.
constexpr const each_uint16_tag each_uint16
A constant object of type each_uint16_tag; a tag that indicates 16-bit unsigned integers.
An empty structure used as a tag for 16-bit unsigned integers.
An empty structure used as a tag for 64-bit unsigned integers.
constexpr const each_int64_tag each_int64
A constant object of type each_int64_tag; a tag that indicates 64-bit signed integers.
nlib_i128_t i128
A typedef of nlib_i128_t.
constexpr const each_uint64_tag each_uint64
A constant object of type each_uint64_tag; a tag that indicates 64-bit unsigned integers.
A class that performs integer SIMD operations using 128-bit registers (XMM0-XMM15 on SSE, Q0-Q15 on NEON). ...
An empty structure used as a tag for 8-bit unsigned integers.
An empty structure used as a tag for 16-bit signed integers.
constexpr const each_int16_tag each_int16
A constant object of type each_int16_tag; a tag that indicates 16-bit signed integers.
constexpr const each_uint32_tag each_uint32
A constant object of type each_uint32_tag; a tag that indicates 32-bit unsigned integers.
#define NLIB_NOEXCEPT
Defined as noexcept, or an equivalent, depending on the environment.
An empty structure used as a tag for selecting lanes split into 16-bit units.
constexpr const each_select16_tag each_select16
A constant object of type each_select16_tag; a tag that indicates selection of 16-bit lanes. ...
#define NLIB_CEXPR
Defined as constexpr when it is available; otherwise it is empty.
__m128i nlib_i128_t
The type of a 128-bit SIMD register for integers.
constexpr const each_select8_tag each_select8
A constant object of type each_select8_tag; a tag that indicates selection of 8-bit lanes. ...
#define NLIB_ALIGNAS(x)
Defined as alignas(x) or an equivalent.
constexpr const each_int8_tag each_int8
A constant object of type each_int8_tag; a tag that indicates 8-bit signed integers.
constexpr const each_select32_tag each_select32
A constant object of type each_select32_tag; a tag that indicates selection of 32-bit lanes. ...
An empty structure used as a tag for 32-bit unsigned integers.
constexpr const each_int32_tag each_int32
A constant object of type each_int32_tag; a tag that indicates 32-bit signed integers.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion; uses static_assert when it is available.