#ifndef INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

#if defined(NLIB_SSE41)
// ... (SSE4.1 intrinsics header is included here; elided in this extract)
#elif defined(NLIB_NEON)
// ... (NEON intrinsics header is included here; elided in this extract)
#endif

#if !defined(_MSC_VER) || _MSC_VER < 1800
// ... (compatibility definitions for compilers without __vectorcall; elided)
#endif

#if defined(NLIB_SIMD)

#if !defined(_MSC_VER)  // assumption: the exact guard between the two
                        // i128arg typedefs is elided in this extract
typedef const i128& i128arg;
#else
typedef const i128 i128arg;  // __vectorcall builds pass i128 by value
#endif

// ... (head of the I128 class; elided in this extract)
class I128 {
 public:
  // ... (the SetValue broadcast overloads are declared around here; those
  // declaration lines are elided in this extract)
  static i128 __vectorcall SetZero() NLIB_NOEXCEPT;
  static i128 __vectorcall SetFull(i128arg dummy) NLIB_NOEXCEPT;

  static i128 __vectorcall LoadA16(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA1(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA1(const void* p) NLIB_NOEXCEPT;
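
  // Usage sketch (illustrative): the A<n> suffix states the alignment in
  // bytes the caller guarantees for p; LoadA16 requires a 16-byte-aligned
  // address, LoadA1 accepts any address.
  //   NLIB_ALIGNAS(16) static const uint8_t buf[32] = {0};
  //   i128 aligned = I128::LoadA16(&buf[0]);   // p must be 16-byte aligned
  //   i128 anywhere = I128::LoadA1(&buf[1]);   // any alignment is fine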
#define NLIB_LOAD_REDIRECT(func) \
  static i128 __vectorcall func(uintptr_t p) NLIB_NOEXCEPT { \
    return func(reinterpret_cast<void*>(p)); \
  } \
  static i128 __vectorcall func(intptr_t p) NLIB_NOEXCEPT { \
    return func(reinterpret_cast<void*>(p)); \
  }

  NLIB_LOAD_REDIRECT(LoadA16)
  NLIB_LOAD_REDIRECT(LoadA8)
  NLIB_LOAD_REDIRECT(LoadA4)
  NLIB_LOAD_REDIRECT(LoadA2)
  NLIB_LOAD_REDIRECT(LoadA1)
  NLIB_LOAD_REDIRECT(LoadLoA8)
  NLIB_LOAD_REDIRECT(LoadLoA4)
  NLIB_LOAD_REDIRECT(LoadLoA2)
  NLIB_LOAD_REDIRECT(LoadLoA1)
  NLIB_LOAD_REDIRECT(LoadHiA8)
  NLIB_LOAD_REDIRECT(LoadHiA4)
  NLIB_LOAD_REDIRECT(LoadHiA2)
  NLIB_LOAD_REDIRECT(LoadHiA1)
#undef NLIB_LOAD_REDIRECT

  static void __vectorcall StoreA16(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA1(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT;
#define NLIB_STORE_REDIRECT(func) \
  static void __vectorcall func(uintptr_t p, i128arg value) NLIB_NOEXCEPT { \
    func(reinterpret_cast<void*>(p), value); \
  } \
  static void __vectorcall func(intptr_t p, i128arg value) NLIB_NOEXCEPT { \
    func(reinterpret_cast<void*>(p), value); \
  }

  NLIB_STORE_REDIRECT(StoreA16)
  NLIB_STORE_REDIRECT(StoreA8)
  NLIB_STORE_REDIRECT(StoreA4)
  NLIB_STORE_REDIRECT(StoreA2)
  NLIB_STORE_REDIRECT(StoreA1)
  NLIB_STORE_REDIRECT(StoreLoA8)
  NLIB_STORE_REDIRECT(StoreLoA4)
  NLIB_STORE_REDIRECT(StoreLoA2)
  NLIB_STORE_REDIRECT(StoreLoA1)
  NLIB_STORE_REDIRECT(StoreHiA8)
  NLIB_STORE_REDIRECT(StoreHiA4)
  NLIB_STORE_REDIRECT(StoreHiA2)
  NLIB_STORE_REDIRECT(StoreHiA1)
#undef NLIB_STORE_REDIRECT

  // Lane accessors; the lane index N is a compile-time template parameter
  // (the template heads are restored here from the definitions below).
  template <size_t N>
  static uint8_t __vectorcall GetUint8FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint16_t __vectorcall GetUint16FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint32_t __vectorcall GetUint32FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint64_t __vectorcall GetUint64FromLane(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT;
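
  // Lane access sketch (illustrative): lanes are selected at compile time.
  //   i128 v = I128::SetZero();
  //   v = I128::SetUint16ToLane<3>(v, 0xBEEF);     // write 16-bit lane 3
  //   uint16_t x = I128::GetUint16FromLane<3>(v);  // x == 0xBEEF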
  static i128 __vectorcall Add8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT;
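
  // Saturation sketch (illustrative): the *Saturated forms clamp instead of
  // wrapping; for uint8 lanes, 200 + 100 yields 255, not 44.
  //   i128 a = I128::SetValue(static_cast<uint8_t>(200), each_uint8);
  //   i128 b = I128::SetValue(static_cast<uint8_t>(100), each_uint8);
  //   i128 wrapped = I128::Add8(a, b);               // lanes hold 44
  //   i128 clamped = I128::AddUint8Saturated(a, b);  // lanes hold 255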
  static i128 __vectorcall Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall And(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Or(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Xor(i128arg a, i128arg b) NLIB_NOEXCEPT;
  // Not() is restored here; its declaration line is missing from this
  // extract but the definitions below require it.
  static i128 __vectorcall Not(i128arg a) NLIB_NOEXCEPT;
  static i128 __vectorcall AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test32(i128arg a, i128arg b) NLIB_NOEXCEPT;
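
  // Test8/16/32 return, per lane, all ones when (a & b) is nonzero and all
  // zeros otherwise, i.e. Not(CmpEqZero(And(a, b))) as defined below.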
  static i128 __vectorcall CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero64(i128arg value) NLIB_NOEXCEPT;
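
  // Comparison results are full-width lane masks (all ones / all zeros), so
  // they compose with Select; branch-free clamp sketch (illustrative,
  // assuming Select picks from a where the mask is set):
  //   i128 too_big = I128::CmpGtInt32(v, limit);
  //   v = I128::Select(too_big, limit, v);  // per-lane min(v, limit)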
  static i128 __vectorcall ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT;
  // Compile-time shift counts; the template heads are restored here (the
  // shift amount N is a template parameter in these overloads).
  template <size_t N>
  static i128 __vectorcall ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ByteShiftLeft(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ByteShiftRight(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall ByteRotateRight(i128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static i128 __vectorcall AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT;
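
  // ByteShiftLeft/Right move whole bytes across the register; AlignR
  // concatenates its two operands and extracts 16 consecutive bytes at byte
  // offset N, in the spirit of SSSE3 palignr / NEON vext (sketch only; the
  // exact operand order follows the underlying intrinsics).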
  static i128 __vectorcall NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
            int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
  static i128 __vectorcall Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
  static i128 __vectorcall Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  template <int V0, int V1, int V2, int V3>
  static i128 __vectorcall Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Reverse16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Reverse32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Reverse64(i128arg value) NLIB_NOEXCEPT;

  static int __vectorcall MoveMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall MoveMask16(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall MoveMask32(i128arg value) NLIB_NOEXCEPT;

  static bool __vectorcall IsZero(i128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsFull(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT;
  static int __vectorcall PopCntMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall ClzMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall CtzMask8(i128arg value) NLIB_NOEXCEPT;
};
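
// Usage sketch (illustrative helper, not part of nlib): test whether any of
// the 16 bytes at p equals key, using an unaligned load, a bytewise compare,
// and the sign-bit mask of the comparison result.
inline bool I128Example_ContainsByte(const void* p, uint8_t key) NLIB_NOEXCEPT {
  i128 data = I128::LoadA1(p);
  i128 eq = I128::CmpEq8(data, I128::SetValue(key, each_uint8));
  return I128::MoveMask8(eq) != 0;
}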
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

#if defined(NLIB_NEON)
// The NLIB_OP*/NLIB_CMP/NLIB_SFT helpers funnel every NEON intrinsic through
// the canonical int8x16_t representation of i128.
#undef vreinterpret_s8_s8
// ... (the matching identity-cast macros are defined here; elided)
#define vreinterpretq_s8_s8(a) (a)
#define NLIB_OP1(intrin, tp, a) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a)))
#define NLIB_OP2(intrin, tp, a, b) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                      vreinterpretq_##tp##_s8(b)))
#define NLIB_OP3(intrin, tp, a, b, c) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                      vreinterpretq_##tp##_s8(b), \
                                      vreinterpretq_##tp##_s8(c)))
#define NLIB_CMP(intrin, tp, a, b, utp) \
  vreinterpretq_s8_##utp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                       vreinterpretq_##tp##_s8(b)))
#define NLIB_SFT(intrin, tp, a, cnt, stp) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vdupq_n_##stp(cnt)))
#define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))
#endif

// Broadcast an int8_t to all 16 lanes. The signature is restored from
// context (its declaration lines are elided above); the tag parameter
// spelling each_int8_tag is an assumption.
NLIB_M(i128) I128::SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)),
                          _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vdupq_n_s8(v);
#endif
}
// The remaining SetValue broadcast overloads follow the same pattern; their
// signature lines are likewise restored, with the tag spellings assumed.
NLIB_M(i128) I128::SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(v);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s16(vdupq_n_s16(v));
#endif
}

NLIB_M(i128) I128::SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(v);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vdupq_n_s32(v));
#endif
}

NLIB_M(i128) I128::SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if !defined(NLIB_64BIT)
  // 32-bit builds go through memory (the guard and temporary are restored;
  // the exact elided lines may differ).
  NLIB_ALIGNAS(16) int64_t tmp[2] = { v, v };
  return I128::LoadA16(tmp);
#else
  return _mm_set1_epi64x(v);
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s64(vdupq_n_s64(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u8(vdupq_n_u8(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(static_cast<int16_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u16(vdupq_n_u16(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(static_cast<int32_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u32(vdupq_n_u32(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if !defined(NLIB_64BIT)
  NLIB_ALIGNAS(16) uint64_t tmp[2] = { v, v };
  return I128::LoadA16(tmp);
#else
  return _mm_set1_epi64x(static_cast<int64_t>(v));
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vdupq_n_u64(v));
#endif
}
// Splat 32-bit lane N across the vector. The member's name is not visible in
// this extract; SetValue32 is an assumed spelling, and the template heads and
// preprocessor scaffolding are restored.
#if defined(NLIB_SSE41)
template <size_t N>
NLIB_M(i128) I128::SetValue32(i128arg value) NLIB_NOEXCEPT {
  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
}
#elif defined(NLIB_NEON)
#ifdef __aarch64__
template <size_t N>
NLIB_M(i128) I128::SetValue32(i128arg value) NLIB_NOEXCEPT {
  uint32x4_t v = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vdupq_laneq_u32(v, N));
}
#else
template <>
NLIB_M(i128) I128::SetValue32<0>(i128arg value) NLIB_NOEXCEPT {
  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
}
template <>
NLIB_M(i128) I128::SetValue32<1>(i128arg value) NLIB_NOEXCEPT {
  uint32x2_t v = vget_low_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
}
template <>
NLIB_M(i128) I128::SetValue32<2>(i128arg value) NLIB_NOEXCEPT {
  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 0));
}
template <>
NLIB_M(i128) I128::SetValue32<3>(i128arg value) NLIB_NOEXCEPT {
  uint32x2_t v = vget_high_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, 1));
}
#endif
#endif
// Splat 16-bit lane N (same restoration caveats as above; SetValue16 is an
// assumed spelling, and the mask declaration line is restored).
#if defined(NLIB_SSE41)
template <size_t N>
NLIB_M(i128) I128::SetValue16(i128arg value) NLIB_NOEXCEPT {
  NLIB_ALIGNAS(16) const int8_t mask[16] = {
    2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
    2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(mask));
}
#elif defined(NLIB_NEON)
template <>
NLIB_M(i128) I128::SetValue16<0>(i128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 0));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
#endif
}
template <>
NLIB_M(i128) I128::SetValue16<1>(i128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 1));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
#endif
}
template <>
NLIB_M(i128) I128::SetValue16<2>(i128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 2));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
#endif
}
template <>
NLIB_M(i128) I128::SetValue16<3>(i128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 3));
#else
  uint16x4_t v = vget_low_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
#endif
}
template <>
NLIB_M(i128) I128::SetValue16<4>(i128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 4));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 0));
#endif
}
template <>
NLIB_M(i128) I128::SetValue16<5>(i128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 5));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 1));
#endif
}
template <>
NLIB_M(i128) I128::SetValue16<6>(i128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 6));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 2));
#endif
}
template <>
NLIB_M(i128) I128::SetValue16<7>(i128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, 7));
#else
  uint16x4_t v = vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, 3));
#endif
}
#endif
// Splat 8-bit lane N (SetValue8 is an assumed spelling; the helper struct
// scaffolding is restored around the statements shown in the extract).
#if defined(NLIB_SSE41)
template <size_t N>
NLIB_M(i128) I128::SetValue8(i128arg value) NLIB_NOEXCEPT {
  NLIB_ALIGNAS(16) const int8_t mask[16] = {
    N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
}
#elif defined(NLIB_NEON)
namespace detail {
template <size_t N, bool IsLower>
struct SetValue8Helper {
  i128 operator()(i128arg value) const NLIB_NOEXCEPT {
    return vdupq_lane_s8(vget_low_s8(value), N);
  }
};
template <size_t N>
struct SetValue8Helper<N, false> {
  i128 operator()(i128arg value) const NLIB_NOEXCEPT {
    return vdupq_lane_s8(vget_high_s8(value), N - 8);
  }
};
}  // namespace detail

template <size_t N>
NLIB_M(i128) I128::SetValue8(i128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vdupq_laneq_s8(value, N);
#else
  return detail::SetValue8Helper<N, (N < 8)>()(value);
#endif
}
#endif
NLIB_M(i128) I128::SetZero() NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_setzero_si128();
#elif defined(NLIB_NEON)
  return vdupq_n_s8(0);
#endif
}

// All bits set: comparing any register with itself yields all-ones lanes.
NLIB_M(i128) I128::SetFull(i128arg dummy) NLIB_NOEXCEPT {
  return I128::CmpEq8(dummy, dummy);
}
// Load definitions; the signature lines are restored from the declarations
// above.
NLIB_M(i128) I128::LoadA16(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
  return vreinterpretq_s8_u64(tmp);
#endif
}

NLIB_M(i128) I128::LoadA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
  return vreinterpretq_s8_u64(tmp);
#endif
}

NLIB_M(i128) I128::LoadA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vld1q_u32(reinterpret_cast<const uint32_t*>(p));
  return vreinterpretq_s8_u32(tmp);
#endif
}

NLIB_M(i128) I128::LoadA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint16x8_t tmp = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
  return vreinterpretq_s8_u16(tmp);
#endif
}

NLIB_M(i128) I128::LoadA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  return vld1q_s8(reinterpret_cast<const int8_t*>(p));
#endif
}

NLIB_M(i128) I128::LoadLoA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u64(vld1_u64(reinterpret_cast<const uint64_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

NLIB_M(i128) I128::LoadLoA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u32(vld1_u32(reinterpret_cast<const uint32_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

NLIB_M(i128) I128::LoadLoA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u16(vld1_u16(reinterpret_cast<const uint16_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

NLIB_M(i128) I128::LoadLoA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vld1_s8(reinterpret_cast<const int8_t*>(p));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

// LoadHi* read 8 bytes into the upper half and zero the lower half; the SSE
// path loads into the low half, then swaps the 64-bit halves.
NLIB_M(i128) I128::LoadHiA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u64(vld1_u64(reinterpret_cast<const uint64_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

NLIB_M(i128) I128::LoadHiA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u32(vld1_u32(reinterpret_cast<const uint32_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

NLIB_M(i128) I128::LoadHiA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u16(vld1_u16(reinterpret_cast<const uint16_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

NLIB_M(i128) I128::LoadHiA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vld1_s8(reinterpret_cast<const int8_t*>(p));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}
NLIB_M(void) I128::StoreA16(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_store_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

NLIB_M(void) I128::StoreA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

NLIB_M(void) I128::StoreA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u32(reinterpret_cast<uint32_t*>(p), vreinterpretq_u32_s8(value));
#endif
}

NLIB_M(void) I128::StoreA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u16(reinterpret_cast<uint16_t*>(p), vreinterpretq_u16_s8(value));
#endif
}

NLIB_M(void) I128::StoreA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_s8(reinterpret_cast<int8_t*>(p), value);
#endif
}

NLIB_M(void) I128::StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint64x1_t x = vreinterpret_u64_s8(vget_low_s8(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint32x2_t x = vreinterpret_u32_s8(vget_low_s8(value));
  vst1_u32(reinterpret_cast<uint32_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint16x4_t x = vreinterpret_u16_s8(vget_low_s8(value));
  vst1_u16(reinterpret_cast<uint16_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  int8x8_t x = vget_low_s8(value);
  vst1_s8(reinterpret_cast<int8_t*>(p), x);
#endif
}

// StoreHi* write the upper 8 bytes; the SSE path swaps the halves first.
NLIB_M(void) I128::StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint64x1_t x = vreinterpret_u64_s8(vget_high_s8(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint32x2_t x = vreinterpret_u32_s8(vget_high_s8(value));
  vst1_u32(reinterpret_cast<uint32_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint16x4_t x = vreinterpret_u16_s8(vget_high_s8(value));
  vst1_u16(reinterpret_cast<uint16_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int8x8_t x = vget_high_s8(value);
  vst1_s8(reinterpret_cast<int8_t*>(p), x);
#endif
}
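
// Round-trip sketch (illustrative helper, not part of nlib): copy 16 bytes
// through a register; both pointers are assumed 16-byte aligned.
inline void I128Example_Copy16(void* dst, const void* src) NLIB_NOEXCEPT {
  I128::StoreA16(dst, I128::LoadA16(src));
}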
template <size_t N>
NLIB_M(uint8_t) I128::GetUint8FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u8(vreinterpretq_u8_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint16_t) I128::GetUint16FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u16(vreinterpretq_u16_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint32_t) I128::GetUint32FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u32(vreinterpretq_u32_s8(value), N);
#endif
}

template <size_t N>
NLIB_M(uint64_t) I128::GetUint64FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if defined(NLIB_64BIT)  // assumption: guard elided; 32-bit builds use the
                         // specializations below
  return static_cast<uint64_t>(_mm_extract_epi64(value, N));
#endif
#elif defined(NLIB_NEON)
  return vgetq_lane_u64(vreinterpretq_u64_s8(value), N);
#endif
}

#if defined(NLIB_SSE41) && !defined(NLIB_64BIT)
template <>
NLIB_M(uint64_t) I128::GetUint64FromLane<0>(i128arg value) NLIB_NOEXCEPT {
  uint64_t rval;
  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), value);
  return rval;
}
template <>
NLIB_M(uint64_t) I128::GetUint64FromLane<1>(i128arg value) NLIB_NOEXCEPT {
  uint64_t rval;
  i128 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), tmp);
  return rval;
}
#endif
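
// Extraction sketch (illustrative): horizontal sum of four uint32 lanes via
// two pairwise adds followed by a single lane read.
//   i128 s = I128::PairwiseAdd32(v, v);   // [v0+v1, v2+v3, v0+v1, v2+v3]
//   s = I128::PairwiseAdd32(s, s);        // every lane holds v0+v1+v2+v3
//   uint32_t sum = I128::GetUint32FromLane<0>(s);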
template <size_t N>
NLIB_M(i128) I128::SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
#elif defined(NLIB_NEON)
  // For a compile-time constant v, splat it and blend lane N in via Permute8;
  // otherwise use vsetq_lane. The middle Permute8 arguments are restored from
  // the pattern of the last one shown in the extract.
  return __builtin_constant_p(v) ?
      I128::Permute8<N == 0 ? 16 : 0, N == 1 ? 17 : 1, N == 2 ? 18 : 2,
                     N == 3 ? 19 : 3, N == 4 ? 20 : 4, N == 5 ? 21 : 5,
                     N == 6 ? 22 : 6, N == 7 ? 23 : 7, N == 8 ? 24 : 8,
                     N == 9 ? 25 : 9, N == 10 ? 26 : 10, N == 11 ? 27 : 11,
                     N == 12 ? 28 : 12, N == 13 ? 29 : 13, N == 14 ? 30 : 14,
                     N == 15 ? 31 : 15>(value, vreinterpretq_s8_u8(vdupq_n_u8(v))) :
      vreinterpretq_s8_u8(vsetq_lane_u8(v, vreinterpretq_u8_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i128) I128::SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v) ?
      I128::Permute16<N == 0 ? 8 : 0, N == 1 ? 9 : 1, N == 2 ? 10 : 2,
                      N == 3 ? 11 : 3, N == 4 ? 12 : 4, N == 5 ? 13 : 5,
                      N == 6 ? 14 : 6,
                      N == 7 ? 15 : 7>(value, vreinterpretq_s8_u16(vdupq_n_u16(v))) :
      vreinterpretq_s8_u16(vsetq_lane_u16(v, vreinterpretq_u16_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i128) I128::SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi32(value, static_cast<uint32_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v) ?
      I128::Permute32<N == 0 ? 4 : 0,
                      N == 1 ? 5 : 1,
                      N == 2 ? 6 : 2,
                      N == 3 ? 7 : 3>(value, vreinterpretq_s8_u32(vdupq_n_u32(v))) :
      vreinterpretq_s8_u32(vsetq_lane_u32(v, vreinterpretq_u32_s8(value), N));
#endif
}

template <size_t N>
NLIB_M(i128) I128::SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if defined(NLIB_64BIT)  // assumption: guard elided in this extract
  return _mm_insert_epi64(value, static_cast<int64_t>(v), N);
#else
  // 32-bit builds insert the value as two 32-bit halves (the union
  // declaration is restored; its exact lines are elided).
  union {
    int64_t i64;
    int32_t i32[2];
  } tmp;
  i128 rval;
  tmp.i64 = static_cast<int64_t>(v);
  rval = _mm_insert_epi32(value, tmp.i32[0], N * 2 + 0);
  return _mm_insert_epi32(rval, tmp.i32[1], N * 2 + 1);
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vsetq_lane_u64(v, vreinterpretq_u64_s8(value), N));
#endif
}
NLIB_M(i128) I128::Add8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi8(a, b);
#elif defined(NLIB_NEON)
  return vaddq_s8(a, b);
#endif
}

NLIB_M(i128) I128::Add16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s16, a, b);
#endif
}

NLIB_M(i128) I128::Add32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s32, a, b);
#endif
}

NLIB_M(i128) I128::Add64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s64, a, b);
#endif
}

NLIB_M(i128) I128::AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi8(a, b);
#elif defined(NLIB_NEON)
  return vqaddq_s8(a, b);
#endif
}

NLIB_M(i128) I128::AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, s16, a, b);
#endif
}

NLIB_M(i128) I128::AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, u8, a, b);
#endif
}

NLIB_M(i128) I128::AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, u16, a, b);
#endif
}

NLIB_M(i128) I128::Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(a, b);
#elif defined(NLIB_NEON)
  return vsubq_s8(a, b);
#endif
}

NLIB_M(i128) I128::Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s16, a, b);
#endif
}

NLIB_M(i128) I128::Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s32, a, b);
#endif
}

NLIB_M(i128) I128::Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s64, a, b);
#endif
}

NLIB_M(i128) I128::SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, s8, a, b);
#endif
}

NLIB_M(i128) I128::SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, s16, a, b);
#endif
}

NLIB_M(i128) I128::SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, u8, a, b);
#endif
}

NLIB_M(i128) I128::SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, u16, a, b);
#endif
}
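
// Sketch (illustrative helper, not part of nlib): |a - b| for uint8 lanes
// via the two saturated subtractions; one operand of the Or is always zero,
// so no intermediate overflow can occur.
inline i128 I128Example_AbsDiffUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
  return I128::Or(I128::SubUint8Saturated(a, b), I128::SubUint8Saturated(b, a));
}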
NLIB_M(i128) I128::PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ax = _mm_add_epi8(a, _mm_srli_epi16(a, 8));
  __m128i bx = _mm_add_epi8(b, _mm_srli_epi16(b, 8));
  return I128::NarrowFrom16To8(ax, bx);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpaddq_s8(a, b);
#else
  int8x8_t al = vget_low_s8(a);
  int8x8_t ah = vget_high_s8(a);
  int8x8_t rl = vpadd_s8(al, ah);
  int8x8_t bl = vget_low_s8(b);
  int8x8_t bh = vget_high_s8(b);
  int8x8_t rh = vpadd_s8(bl, bh);
  return vcombine_s8(rl, rh);
#endif
#endif
}

NLIB_M(i128) I128::PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi16(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vreinterpretq_s8_s16(vpaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)));
#else
  int16x4_t al = vget_low_s16(vreinterpretq_s16_s8(a));
  int16x4_t ah = vget_high_s16(vreinterpretq_s16_s8(a));
  int16x4_t rl = vpadd_s16(al, ah);
  int16x4_t bl = vget_low_s16(vreinterpretq_s16_s8(b));
  int16x4_t bh = vget_high_s16(vreinterpretq_s16_s8(b));
  int16x4_t rh = vpadd_s16(bl, bh);
  return NLIB_CMB(s16, rl, rh);
#endif
#endif
}

NLIB_M(i128) I128::PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi32(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vreinterpretq_s8_s32(vpaddq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)));
#else
  int32x2_t al = vget_low_s32(vreinterpretq_s32_s8(a));
  int32x2_t ah = vget_high_s32(vreinterpretq_s32_s8(a));
  int32x2_t rl = vpadd_s32(al, ah);
  int32x2_t bl = vget_low_s32(vreinterpretq_s32_s8(b));
  int32x2_t bh = vget_high_s32(vreinterpretq_s32_s8(b));
  int32x2_t rh = vpadd_s32(bl, bh);
  return NLIB_CMB(s32, rl, rh);
#endif
#endif
}
NLIB_M(i128) I128::Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmulq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlaq, s16, c, a, b);
#endif
}

NLIB_M(i128) I128::MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlsq, s16, c, a, b);
#endif
}

NLIB_M(i128) I128::Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmulq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlaq, s32, c, a, b);
#endif
}

NLIB_M(i128) I128::MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlsq, s32, c, a, b);
#endif
}
NLIB_M(i128) I128::MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s8, a, b);
#endif
}

NLIB_M(i128) I128::MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u8, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u16, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u32, a, b);
#endif
}

NLIB_M(i128) I128::MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s8, a, b);
#endif
}

NLIB_M(i128) I128::MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u8, a, b);
#endif
}

NLIB_M(i128) I128::MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u16, a, b);
#endif
}

NLIB_M(i128) I128::MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u32, a, b);
#endif
}
NLIB_M(i128) I128::AbsInt8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi8(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s8, value);
#endif
}

NLIB_M(i128) I128::AbsInt16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi16(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s16, value);
#endif
}

NLIB_M(i128) I128::AbsInt32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi32(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s32, value);
#endif
}

NLIB_M(i128) I128::AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi8(_mm_sub_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s8, a, b);
#endif
}

NLIB_M(i128) I128::AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi16(_mm_sub_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s16, a, b);
#endif
}

NLIB_M(i128) I128::AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi32(_mm_sub_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s32, a, b);
#endif
}
NLIB_M(i128) I128::NegateInt8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s8, value);
#endif
}

NLIB_M(i128) I128::NegateInt16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s16, value);
#endif
}

NLIB_M(i128) I128::NegateInt32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s32, value);
#endif
}
NLIB_M(i128) I128::And(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_and_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vandq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Or(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vorrq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Xor(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_xor_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(veorq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Not(i128arg a) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  // ~a == andnot(a, all-ones)
  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vmvnq, s8, a);
#endif
}

// AndNot(a, b) computes ~a & b on both back ends.
NLIB_M(i128) I128::AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_andnot_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vbicq, s8, b, a);
#endif
}

// OrNot(a, b) computes ~a | b on both back ends.
NLIB_M(i128) I128::OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i not_a = _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
  return _mm_or_si128(not_a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vornq, s8, b, a);
#endif
}

NLIB_M(i128) I128::Test8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return vtstq_s8(a, b);
#else
  return I128::Not(I128::CmpEqZero8(I128::And(a, b)));
#endif
}

NLIB_M(i128) I128::Test16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return NLIB_OP2(vtstq, s16, a, b);
#else
  return I128::Not(I128::CmpEqZero16(I128::And(a, b)));
#endif
}

NLIB_M(i128) I128::Test32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return NLIB_OP2(vtstq, s32, a, b);
#else
  return I128::Not(I128::CmpEqZero32(I128::And(a, b)));
#endif
}
NLIB_M(i128) I128::CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi64(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_CMP(vceqq, s64, a, b, u64);
#else
  // A 64-bit lane matches iff both 32-bit halves match: compare as u32,
  // transpose so each half-pair lands in one doubleword, AND them, then
  // sign-extend the 32-bit mask to 64 bits.
  uint32x4_t x0 = vceqq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b));
  uint32x2x2_t x1 = vtrn_u32(vget_low_u32(x0), vget_high_u32(x0));
  uint32x2_t x2 = vand_u32(x1.val[0], x1.val[1]);
  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
  return vreinterpretq_s8_s64(result);
#endif
#endif
}
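
// Mask sketch (illustrative): comparisons yield all-ones lanes, so IsFull and
// MoveMask8 summarize them. "All 16 bytes equal" is
//   I128::IsFull(I128::CmpEq8(x, y))
// and the index of the first differing byte (assuming CtzMask8 counts
// trailing zeros of the byte mask) is
//   I128::CtzMask8(I128::Not(I128::CmpEq8(x, y)))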
NLIB_M(i128) I128::CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  return _mm_cmpgt_epi64(b, a);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_CMP(vcltq, s64, a, b, u64);
#else
  // a < b for int64 lanes: signed compare on the upper halves, unsigned on
  // the lower halves, combined as upper_lt | (upper_eq & lower_lt).
  int32x2x2_t trn_a = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(a)),
                               vreinterpret_s32_s8(vget_high_s8(a)));
  int32x2x2_t trn_b = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(b)),
                               vreinterpret_s32_s8(vget_high_s8(b)));
  uint32x2_t upper_lt = vclt_s32(trn_a.val[1], trn_b.val[1]);
  uint32x2_t upper_eq = vceq_s32(trn_a.val[1], trn_b.val[1]);
  // The lower halves compare unsigned (the reinterprets restore what the
  // extract truncated here).
  uint32x2_t lower_lt = vclt_u32(vreinterpret_u32_s32(trn_a.val[0]),
                                 vreinterpret_u32_s32(trn_b.val[0]));
  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
  return vreinterpretq_s8_s64(result);
#endif
#else
  // SSE4.1 without SSE4.2: derive the 64-bit compare from 32-bit compares.
  i128 cmp = I128::CmpLtInt32(a, b);
  i128 eq = I128::CmpEq32(a, b);
  i128 cmp_lt = I128::CmpLtUint32(a, b);
  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp_lt, cmp_lt);
  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
#endif
}
NLIB_M(i128) I128::CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  return _mm_cmpgt_epi64(a, b);
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgtq, s64, a, b, u64);
#else
  return I128::CmpLtInt64(b, a);
#endif
}
// SSE has no unsigned compares below SSE4.2, so the Uint forms bias both
// operands by the sign bit and reuse the signed compare.
NLIB_M(i128) I128::CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80, each_uint8);
  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80, each_uint8);
  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x8000U, each_uint16);
  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x8000U, each_uint16);
  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  return _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_CMP(vcltq, u64, a, b, u64);
#else
  uint32x2x2_t trn_a = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(a)),
                                vreinterpret_u32_s8(vget_high_s8(a)));
  uint32x2x2_t trn_b = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(b)),
                                vreinterpret_u32_s8(vget_high_s8(b)));
  uint32x2_t upper_lt = vclt_u32(trn_a.val[1], trn_b.val[1]);
  uint32x2_t upper_eq = vceq_u32(trn_a.val[1], trn_b.val[1]);
  uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
  return vreinterpretq_s8_s64(result);
#endif
#else
  i128 cmp = I128::CmpLtUint32(a, b);
  i128 eq = I128::CmpEq32(a, b);
  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp, cmp);
  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
#endif
}

NLIB_M(i128) I128::CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  return _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgtq, u64, a, b, u64);
#else
  return I128::CmpLtUint64(b, a);
#endif
}
NLIB_M(i128) I128::CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  return _mm_or_si128(_mm_cmpgt_epi64(b, a), _mm_cmpeq_epi64(a, b));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcleq, s64, a, b, u64);
#else
  return I128::Not(I128::CmpGtInt64(a, b));
#endif
}

NLIB_M(i128) I128::CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  return _mm_or_si128(_mm_cmpgt_epi64(a, b), _mm_cmpeq_epi64(a, b));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgeq, s64, a, b, u64);
#else
  return I128::Not(I128::CmpLtInt64(a, b));
#endif
}
// Unsigned 'less or equal' on SSE: a <= b exactly when min(a, b) == a.
NLIB_M(i128) I128::CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, u32, a, b, u32);
#endif
}

// There is no 64-bit unsigned min/compare before AVX-512, so both operands
// are biased by 2^63 to make the signed SSE4.2 compare produce the
// unsigned ordering.
NLIB_M(i128) I128::CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcleq, u64, a, b, u64);
#else
  return I128::Not(I128::CmpGtUint64(a, b));
#endif
}

// Unsigned 'greater or equal': a >= b exactly when max(a, b) == a.
NLIB_M(i128) I128::CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgeq, u64, a, b, u64);
#else
  return I128::Not(I128::CmpLtUint64(a, b));
#endif
}
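A scalar sketch (editorial addition, not from the header) of the bias trick the SSE4.2 paths above rely on: adding 2^63 maps unsigned order onto signed order, which matters because _mm_cmpgt_epi64 only provides a signed 64-bit compare. The helper name is hypothetical.

// Editorial sketch: a <(unsigned) b, computed with a signed compare.
inline bool UnsignedLessViaSignedCompare(uint64_t a, uint64_t b) {
  const uint64_t bias = 0x8000000000000000ULL;  // 2^63 flips the sign bit
  return static_cast<int64_t>(a + bias) < static_cast<int64_t>(b + bias);
}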
NLIB_M(i128) I128::CmpEqZero8(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vceqzq_s8(value);
#else
  return I128::CmpEq8(value, I128::SetZero());
#endif
}

NLIB_M(i128) I128::CmpEqZero16(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vreinterpretq_s8_s16(vceqzq_s16(vreinterpretq_s16_s8(value)));
#else
  return I128::CmpEq16(value, I128::SetZero());
#endif
}

NLIB_M(i128) I128::CmpEqZero32(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vreinterpretq_s8_s32(vceqzq_s32(vreinterpretq_s32_s8(value)));
#else
  return I128::CmpEq32(value, I128::SetZero());
#endif
}

NLIB_M(i128) I128::CmpEqZero64(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vreinterpretq_s8_s64(vceqzq_s64(vreinterpretq_s64_s8(value)));
#else
  return I128::CmpEq64(value, I128::SetZero());
#endif
}
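The CmpEqZero* helpers fold the zero compare into one AArch64 instruction (vceqzq_*) and otherwise compare against SetZero(). A hedged usage sketch (editorial addition; the helper name is hypothetical, and only members shown in this header plus nlib_ctz, used later in this file, are assumed): locating a zero terminator in a 16-byte block.

// Editorial sketch: index of the first zero byte, or 16 if there is none.
inline int FindZeroByte(i128arg bytes) NLIB_NOEXCEPT {
  int mask = I128::MoveMask8(I128::CmpEqZero8(bytes));  // bit i = (byte i == 0)
  return nlib_ctz(static_cast<uint32_t>(mask | 0x10000));
}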
// SSE has no 8-bit shifts: widen each half to 16 bits, shift there, and
// narrow back. NEON shifts right by passing a negative count to vshlq.
NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i xh = _mm_slli_epi16(_mm_cvtepu8_epi16(hi), count);
  __m128i xl = _mm_slli_epi16(_mm_cvtepu8_epi16(value), count);
  return I128::NarrowFrom16To8(xl, xh);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u8, value, count, s8);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i xh = _mm_srli_epi16(_mm_cvtepu8_epi16(hi), count);
  __m128i xl = _mm_srli_epi16(_mm_cvtepu8_epi16(value), count);
  return _mm_packus_epi16(xl, xh);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u8, value, -count, s8);
#endif
}

NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i xh = _mm_srai_epi16(_mm_cvtepi8_epi16(hi), count);
  __m128i xl = _mm_srai_epi16(_mm_cvtepi8_epi16(value), count);
  return _mm_packs_epi16(xl, xh);  // signed pack preserves the shifted-in sign bits
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s8, value, -count, s8);
#endif
}
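A scalar sketch (editorial addition, not from the header) of the widen-shift-narrow emulation above: each byte is sign-extended to 16 bits, shifted there, and narrowed back, and because the shifted value of a sign-extended byte still fits in 8 bits, no information is lost. The helper name is hypothetical.

// Editorial sketch: one lane of the 8-bit arithmetic right shift.
inline int8_t EmulatedShiftRightArithmetic8(int8_t v, int count) {
  return static_cast<int8_t>(static_cast<int16_t>(v) >> count);
}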
NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u16, value, count, s16);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u16, value, -count, s16);
#endif
}

NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srai_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s16, value, -count, s16);
#endif
}

NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u32, value, count, s32);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u32, value, -count, s32);
#endif
}

NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srai_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s32, value, -count, s32);
#endif
}

NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi64(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u64, value, count, s64);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi64(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u64, value, -count, s64);
#endif
}
template <int N>
NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return vshlq_n_s8(value, N);
#else
  return I128::ShiftLeftLogical8(value, N);
#endif
}

template <int N>
NLIB_M(i128) I128::ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint8x16_t tmp = vreinterpretq_u8_s8(value);
  return vreinterpretq_s8_u8(vshrq_n_u8(tmp, N));
#else
  return I128::ShiftRightLogical8(value, N);
#endif
}

template <int N>
NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return vshrq_n_s8(value, N);
#else
  return I128::ShiftRightArithmetic8(value, N);
#endif
}

template <int N>
NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint16x8_t tmp = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vshlq_n_u16(tmp, N));
#else
  return I128::ShiftLeftLogical16(value, N);
#endif
}

template <int N>
NLIB_M(i128) I128::ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint16x8_t tmp = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vshrq_n_u16(tmp, N));
#else
  return I128::ShiftRightLogical16(value, N);
#endif
}

template <int N>
NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  int16x8_t tmp = vreinterpretq_s16_s8(value);
  return vreinterpretq_s8_s16(vshrq_n_s16(tmp, N));
#else
  return I128::ShiftRightArithmetic16(value, N);
#endif
}

template <int N>
NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint32x4_t tmp = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vshlq_n_u32(tmp, N));
#else
  return I128::ShiftLeftLogical32(value, N);
#endif
}

template <int N>
NLIB_M(i128) I128::ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint32x4_t tmp = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vshrq_n_u32(tmp, N));
#else
  return I128::ShiftRightLogical32(value, N);
#endif
}

template <int N>
NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  int32x4_t tmp = vreinterpretq_s32_s8(value);
  return vreinterpretq_s8_s32(vshrq_n_s32(tmp, N));
#else
  return I128::ShiftRightArithmetic32(value, N);
#endif
}

template <int N>
NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint64x2_t tmp = vreinterpretq_u64_s8(value);
  return vreinterpretq_s8_u64(vshlq_n_u64(tmp, N));
#else
  return I128::ShiftLeftLogical64(value, N);
#endif
}

template <int N>
NLIB_M(i128) I128::ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint64x2_t tmp = vreinterpretq_u64_s8(value);
  return vreinterpretq_s8_u64(vshrq_n_u64(tmp, N));
#else
  return I128::ShiftRightLogical64(value, N);
#endif
}
// Explicit specializations for the boundary counts the NEON immediate-form
// shifts cannot encode: a full-width shift clears the register, and a
// zero-count shift is the identity.
template <>
NLIB_M(i128) I128::ShiftLeftLogical8<8>(i128arg value) NLIB_NOEXCEPT {
  return I128::SetZero();
}

template <>
NLIB_M(i128) I128::ShiftRightLogical8<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}

template <>
NLIB_M(i128) I128::ShiftLeftLogical16<16>(i128arg value) NLIB_NOEXCEPT {
  return I128::SetZero();
}

template <>
NLIB_M(i128) I128::ShiftRightLogical16<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}

template <>
NLIB_M(i128) I128::ShiftRightArithmetic16<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}

template <>
NLIB_M(i128) I128::ShiftLeftLogical32<32>(i128arg value) NLIB_NOEXCEPT {
  return I128::SetZero();
}

template <>
NLIB_M(i128) I128::ShiftRightLogical32<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}

template <>
NLIB_M(i128) I128::ShiftRightArithmetic32<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}

template <>
NLIB_M(i128) I128::ShiftLeftLogical64<64>(i128arg value) NLIB_NOEXCEPT {
  return I128::SetZero();
}

template <>
NLIB_M(i128) I128::ShiftRightLogical64<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}
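A hedged usage sketch of the compile-time shift family (editorial addition; the helper names are hypothetical): a constant count selects the template overloads above, which map onto NEON's immediate-form shifts, while the boundary counts hit the explicit specializations.

// Editorial sketch: compile-time shift counts resolved at instantiation.
inline i128 TimesEight16(i128arg v) NLIB_NOEXCEPT {
  return I128::ShiftLeftLogical16<3>(v);   // immediate-form shift on NEON
}
inline i128 Untouched16(i128arg v) NLIB_NOEXCEPT {
  return I128::ShiftRightLogical16<0>(v);  // specialization: returns v as-is
}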
template <int N>
NLIB_M(i128) I128::ByteShiftLeft(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_si128(value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(vdupq_n_s8(0), value, 16 - N);
#endif
}

template <int N>
NLIB_M(i128) I128::ByteShiftRight(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_si128(value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(value, vdupq_n_s8(0), N);
#endif
}

template <int N>
NLIB_M(i128) I128::ByteRotateRight(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_alignr_epi8(value, value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(value, value, N);
#endif
}

// AlignR<N>(a, b): the 16 bytes starting at byte N of the 32-byte
// concatenation with b low and a high (the _mm_alignr_epi8 operand order).
template <int N>
NLIB_M(i128) I128::AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_alignr_epi8(a, b, N);
#elif defined(NLIB_NEON)
  return vextq_s8(b, a, N);
#endif
}
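ByteShiftLeft/Right, ByteRotateRight, and AlignR all compile to a single _mm_slli_si128 / _mm_srli_si128 / _mm_alignr_epi8 on SSE and to vextq_s8 on NEON. A hedged sketch of the classic sliding-window use of AlignR (editorial addition; the helper name is hypothetical):

// Editorial sketch: a 16-byte window advanced one byte across two
// consecutive blocks, as in string scanning over streamed data.
inline i128 WindowPlusOne(i128arg prev_block, i128arg next_block) NLIB_NOEXCEPT {
  // bytes 1..15 of prev_block followed by byte 0 of next_block
  return I128::AlignR<1>(next_block, prev_block);
}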
// Truncating narrows: keep the low half of each lane. The SSE paths mask
// first so the saturating pack instruction cannot clamp anything.
NLIB_M(i128) I128::NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0x00FFU, each_uint16);
  __m128i lo_mask = _mm_and_si128(lo, mask);
  __m128i hi_mask = _mm_and_si128(hi, mask);
  return _mm_packus_epi16(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
  return vreinterpretq_s8_u8(vmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
#else
  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
  uint8x8_t h = vmovn_u16(vreinterpretq_u16_s8(hi));
  return NLIB_CMB(u8, l, h);
#endif
#endif
}

NLIB_M(i128) I128::NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
  __m128i lo_mask = _mm_and_si128(lo, mask);
  __m128i hi_mask = _mm_and_si128(hi, mask);
  return _mm_packus_epi32(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
  return vreinterpretq_s8_u16(vmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
#else
  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
  uint16x4_t h = vmovn_u32(vreinterpretq_u32_s8(hi));
  return NLIB_CMB(u16, l, h);
#endif
#endif
}

NLIB_M(i128) I128::NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i lo_ = _mm_shuffle_epi32(lo, _MM_SHUFFLE(3, 1, 2, 0));
  __m128i hi_ = _mm_shuffle_epi32(hi, _MM_SHUFFLE(3, 1, 2, 0));
  return _mm_unpacklo_epi64(lo_, hi_);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
  return vreinterpretq_s8_u32(vmovn_high_u64(l, vreinterpretq_u64_s8(hi)));
#else
  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
  uint32x2_t h = vmovn_u64(vreinterpretq_u64_s8(hi));
  return NLIB_CMB(u32, l, h);
#endif
#endif
}
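A scalar sketch (editorial addition, not from the header) of the distinction between the two narrowing families: NarrowFrom16To8 keeps the low byte, while the ConvertFrom*Saturated functions below clamp out-of-range values. The helper names are hypothetical.

// Editorial sketch: truncating vs. saturating narrow on one scalar lane.
inline uint8_t TruncatingNarrow(uint16_t v) {
  return static_cast<uint8_t>(v);                    // 0x0123 -> 0x23
}
inline uint8_t SaturatingNarrow(uint16_t v) {
  return v > 0xFF ? 0xFF : static_cast<uint8_t>(v);  // 0x0123 -> 0xFF
}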
// Saturating narrows. _mm_packus_epi16/_mm_packus_epi32 saturate from
// *signed* inputs, so the unsigned variants clear the top bit first to keep
// the pack inputs non-negative.
NLIB_M(i128) I128::ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 b7FFF = I128::SetValue(0x7FFFU, each_uint16);
  __m128i lotmp = _mm_and_si128(lo, b7FFF);
  __m128i hitmp = _mm_and_si128(hi, b7FFF);
  return _mm_packus_epi16(lotmp, hitmp);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
  return vreinterpretq_s8_u8(vqmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
#else
  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
  uint8x8_t h = vqmovn_u16(vreinterpretq_u16_s8(hi));
  return NLIB_CMB(u8, l, h);
#endif
#endif
}

NLIB_M(i128) I128::ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_packs_epi16(lo, hi);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
  return vqmovn_high_s16(l, vreinterpretq_s16_s8(hi));
#else
  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
  int8x8_t h = vqmovn_s16(vreinterpretq_s16_s8(hi));
  return NLIB_CMB(s8, l, h);
#endif
#endif
}

NLIB_M(i128) I128::ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 b7FFFFFFF = I128::SetValue(0x7FFFFFFFU, each_uint32);
  __m128i lotmp = _mm_and_si128(lo, b7FFFFFFF);
  __m128i hitmp = _mm_and_si128(hi, b7FFFFFFF);
  return _mm_packus_epi32(lotmp, hitmp);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
  return vreinterpretq_s8_u16(vqmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
#else
  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
  uint16x4_t h = vqmovn_u32(vreinterpretq_u32_s8(hi));
  return NLIB_CMB(u16, l, h);
#endif
#endif
}

NLIB_M(i128) I128::ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_packs_epi32(lo, hi);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
  return vreinterpretq_s8_s16(vqmovn_high_s32(l, vreinterpretq_s32_s8(hi)));
#else
  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
  int16x4_t h = vqmovn_s32(vreinterpretq_s32_s8(hi));
  return NLIB_CMB(s16, l, h);
#endif
#endif
}
NLIB_M(i128) I128::ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi8_epi16(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s16(vmovl_s8(vget_low_s8(value)));
#endif
}

NLIB_M(i128) I128::ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  int16x8_t result = vmovl_high_s8(value);
#else
  int16x8_t result = vmovl_s8(vget_high_s8(value));
#endif
  return vreinterpretq_s8_s16(result);
#endif
}

NLIB_M(i128) I128::ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi16_epi32(value);
#elif defined(NLIB_NEON)
  int16x8_t x = vreinterpretq_s16_s8(value);
  int32x4_t result = vmovl_s16(vget_low_s16(x));
  return vreinterpretq_s8_s32(result);
#endif
}

NLIB_M(i128) I128::ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int16x8_t x = vreinterpretq_s16_s8(value);
#ifdef __aarch64__
  int32x4_t result = vmovl_high_s16(x);
#else
  int32x4_t result = vmovl_s16(vget_high_s16(x));
#endif
  return vreinterpretq_s8_s32(result);
#endif
}

NLIB_M(i128) I128::ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_epi64(value);
#elif defined(NLIB_NEON)
  int32x4_t x = vreinterpretq_s32_s8(value);
  int64x2_t result = vmovl_s32(vget_low_s32(x));
  return vreinterpretq_s8_s64(result);
#endif
}

NLIB_M(i128) I128::ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int32x4_t x = vreinterpretq_s32_s8(value);
#ifdef __aarch64__
  int64x2_t result = vmovl_high_s32(x);
#else
  int64x2_t result = vmovl_s32(vget_high_s32(x));
#endif
  return vreinterpretq_s8_s64(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu8_epi16(value);
#elif defined(NLIB_NEON)
  uint8x16_t x = vreinterpretq_u8_s8(value);
  uint16x8_t result = vmovl_u8(vget_low_u8(x));
  return vreinterpretq_s8_u16(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint8x16_t x = vreinterpretq_u8_s8(value);
#ifdef __aarch64__
  uint16x8_t result = vmovl_high_u8(x);
#else
  uint16x8_t result = vmovl_u8(vget_high_u8(x));
#endif
  return vreinterpretq_s8_u16(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu16_epi32(value);
#elif defined(NLIB_NEON)
  uint16x8_t x = vreinterpretq_u16_s8(value);
  uint32x4_t result = vmovl_u16(vget_low_u16(x));
  return vreinterpretq_s8_u32(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint16x8_t x = vreinterpretq_u16_s8(value);
#ifdef __aarch64__
  uint32x4_t result = vmovl_high_u16(x);
#else
  uint32x4_t result = vmovl_u16(vget_high_u16(x));
#endif
  return vreinterpretq_s8_u32(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu32_epi64(value);
#elif defined(NLIB_NEON)
  uint32x4_t x = vreinterpretq_u32_s8(value);
  uint64x2_t result = vmovl_u32(vget_low_u32(x));
  return vreinterpretq_s8_u64(result);
#endif
}

NLIB_M(i128) I128::ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint32x4_t x = vreinterpretq_u32_s8(value);
#ifdef __aarch64__
  uint64x2_t result = vmovl_high_u32(x);
#else
  uint64x2_t result = vmovl_u32(vget_high_u32(x));
#endif
  return vreinterpretq_s8_u64(result);
#endif
}
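Each widening conversion comes as a Lo/Hi pair covering the two halves of the source register; on AArch64 the Hi form maps onto the dedicated vmovl_high_* instruction instead of an extract-then-widen sequence. A hedged usage sketch (editorial addition; the helper name is hypothetical):

// Editorial sketch: widen all 16 unsigned bytes into two registers of
// eight uint16 lanes each, using the Lo/Hi pair defined above.
inline void WidenBytes(i128arg bytes, i128* lo16, i128* hi16) NLIB_NOEXCEPT {
  *lo16 = I128::ConvertFromUint8ToUint16Lo(bytes);
  *hi16 = I128::ConvertFromUint8ToUint16Hi(bytes);
}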
// Zip interleaves the low (Lo) or high (Hi) halves of a and b; Unzip is the
// inverse, gathering the even (Lo) or odd (Hi) lanes.
NLIB_M(i128) I128::Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi8(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vzip1q_s8(a, b);
#else
  return vzipq_s8(a, b).val[0];
#endif
#endif
}

NLIB_M(i128) I128::Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpackhi_epi8(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vzip2q_s8(a, b);
#else
  return vzipq_s8(a, b).val[1];
#endif
#endif
}

NLIB_M(i128) I128::Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0x00FFU, each_uint16);
  __m128i lo_mask = _mm_and_si128(a, mask);
  __m128i hi_mask = _mm_and_si128(b, mask);
  return _mm_packus_epi16(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vuzp1q_s8(a, b);
#else
  return vuzpq_s8(a, b).val[0];
#endif
#endif
}

NLIB_M(i128) I128::Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFF00U, each_uint16);
  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 1);
  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 1);
  return _mm_packus_epi16(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vuzp2q_s8(a, b);
#else
  return vuzpq_s8(a, b).val[1];
#endif
#endif
}

NLIB_M(i128) I128::Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi16(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vzip1q, u16, a, b);
#else
  return vreinterpretq_s8_u16(vzipq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
#endif
#endif
}

NLIB_M(i128) I128::Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpackhi_epi16(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vzip2q, u16, a, b);
#else
  return vreinterpretq_s8_u16(vzipq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
#endif
#endif
}

NLIB_M(i128) I128::Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
  __m128i lo_mask = _mm_and_si128(a, mask);
  __m128i hi_mask = _mm_and_si128(b, mask);
  return _mm_packus_epi32(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vuzp1q, u16, a, b);
#else
  return vreinterpretq_s8_u16(vuzpq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
#endif
#endif
}

NLIB_M(i128) I128::Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFFFF0000U, each_uint32);
  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 2);
  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 2);
  return _mm_packus_epi32(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vuzp2q, u16, a, b);
#else
  return vreinterpretq_s8_u16(vuzpq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
#endif
#endif
}

NLIB_M(i128) I128::Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi32(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vzip1q, u32, a, b);
#else
  return vreinterpretq_s8_u32(vzipq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
#endif
#endif
}

NLIB_M(i128) I128::Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpackhi_epi32(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vzip2q, u32, a, b);
#else
  return vreinterpretq_s8_u32(vzipq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
#endif
#endif
}

NLIB_M(i128) I128::Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
  return _mm_blend_epi16(x0, x1, 0xF0);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vuzp1q, u32, a, b);
#else
  return vreinterpretq_s8_u32(vuzpq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
#endif
#endif
}

NLIB_M(i128) I128::Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 3, 1));
  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0));
  return _mm_blend_epi16(x0, x1, 0xF0);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_OP2(vuzp2q, u32, a, b);
#else
  return vreinterpretq_s8_u32(vuzpq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
#endif
#endif
}
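The Zip/Unzip family is the standard building block for AoS/SoA conversion. A hedged usage sketch (editorial addition; the helper name is hypothetical): splitting interleaved 16-bit (x, y) pairs into one register of x values and one of y values.

// Editorial sketch: de-interleave {x0,y0,x1,y1,...} 16-bit pairs held in
// two consecutive registers.
inline void SplitXY16(i128arg xy_lo, i128arg xy_hi, i128* xs, i128* ys) NLIB_NOEXCEPT {
  *xs = I128::Unzip16Lo(xy_lo, xy_hi);  // even lanes: x0..x7
  *ys = I128::Unzip16Hi(xy_lo, xy_hi);  // odd lanes:  y0..y7
}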
template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
          int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
NLIB_M(i128) I128::Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return __builtin_shufflevector(
      a, b,
      V0, V1, V2, V3, V4, V5, V6, V7,
      V8, V9, V10, V11, V12, V13, V14, V15);
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v16qi)a, (__v16qi)b,
                                 V0, V1, V2, V3, V4, V5, V6, V7,
                                 V8, V9, V10, V11, V12, V13, V14, V15);
#else
  // Indices outside each operand's range become -128 so that the sign bit
  // makes Shuffle8 write zero; OR-ing the two shuffles merges a and b.
  NLIB_ALIGNAS(16) static const int8_t mask_a[16] = {
    (V0 < 0 || V0 > 15) ? -128 : V0,
    (V1 < 0 || V1 > 15) ? -128 : V1,
    (V2 < 0 || V2 > 15) ? -128 : V2,
    (V3 < 0 || V3 > 15) ? -128 : V3,
    (V4 < 0 || V4 > 15) ? -128 : V4,
    (V5 < 0 || V5 > 15) ? -128 : V5,
    (V6 < 0 || V6 > 15) ? -128 : V6,
    (V7 < 0 || V7 > 15) ? -128 : V7,
    (V8 < 0 || V8 > 15) ? -128 : V8,
    (V9 < 0 || V9 > 15) ? -128 : V9,
    (V10 < 0 || V10 > 15) ? -128 : V10,
    (V11 < 0 || V11 > 15) ? -128 : V11,
    (V12 < 0 || V12 > 15) ? -128 : V12,
    (V13 < 0 || V13 > 15) ? -128 : V13,
    (V14 < 0 || V14 > 15) ? -128 : V14,
    (V15 < 0 || V15 > 15) ? -128 : V15
  };
  NLIB_ALIGNAS(16) static const int8_t mask_b[16] = {
    V0 < 16 ? -128 : (V0 - 16),
    V1 < 16 ? -128 : (V1 - 16),
    V2 < 16 ? -128 : (V2 - 16),
    V3 < 16 ? -128 : (V3 - 16),
    V4 < 16 ? -128 : (V4 - 16),
    V5 < 16 ? -128 : (V5 - 16),
    V6 < 16 ? -128 : (V6 - 16),
    V7 < 16 ? -128 : (V7 - 16),
    V8 < 16 ? -128 : (V8 - 16),
    V9 < 16 ? -128 : (V9 - 16),
    V10 < 16 ? -128 : (V10 - 16),
    V11 < 16 ? -128 : (V11 - 16),
    V12 < 16 ? -128 : (V12 - 16),
    V13 < 16 ? -128 : (V13 - 16),
    V14 < 16 ? -128 : (V14 - 16),
    V15 < 16 ? -128 : (V15 - 16)
  };
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
#endif
}
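Permute8 is a compile-time byte permutation over the 32 bytes of (a, b): indices 0 to 15 select from a, 16 to 31 from b, and anything else zeroes the destination byte. A hedged usage sketch (editorial addition; the helper name is hypothetical):

// Editorial sketch: byte-reverse one register via the generic permute
// (b is not used here, so a is simply passed twice).
inline i128 ReverseBytes(i128arg a) NLIB_NOEXCEPT {
  return I128::Permute8<15, 14, 13, 12, 11, 10, 9, 8,
                        7, 6, 5, 4, 3, 2, 1, 0>(a, a);
}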
template <int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
NLIB_M(i128) I128::Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return vreinterpretq_s8_u16(__builtin_shufflevector(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b),
      V0, V1, V2, V3, V4, V5, V6, V7));
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v8hi)a, (__v8hi)b,
                                 V0, V1, V2, V3, V4, V5, V6, V7);
#else
  NLIB_ALIGNAS(16) static const int8_t mask_a[16] = {
    (V0 < 0 || V0 > 7) ? -128 : V0 * 2,
    (V0 < 0 || V0 > 7) ? -128 : V0 * 2 + 1,
    (V1 < 0 || V1 > 7) ? -128 : V1 * 2,
    (V1 < 0 || V1 > 7) ? -128 : V1 * 2 + 1,
    (V2 < 0 || V2 > 7) ? -128 : V2 * 2,
    (V2 < 0 || V2 > 7) ? -128 : V2 * 2 + 1,
    (V3 < 0 || V3 > 7) ? -128 : V3 * 2,
    (V3 < 0 || V3 > 7) ? -128 : V3 * 2 + 1,
    (V4 < 0 || V4 > 7) ? -128 : V4 * 2,
    (V4 < 0 || V4 > 7) ? -128 : V4 * 2 + 1,
    (V5 < 0 || V5 > 7) ? -128 : V5 * 2,
    (V5 < 0 || V5 > 7) ? -128 : V5 * 2 + 1,
    (V6 < 0 || V6 > 7) ? -128 : V6 * 2,
    (V6 < 0 || V6 > 7) ? -128 : V6 * 2 + 1,
    (V7 < 0 || V7 > 7) ? -128 : V7 * 2,
    (V7 < 0 || V7 > 7) ? -128 : V7 * 2 + 1
  };
  NLIB_ALIGNAS(16) static const int8_t mask_b[16] = {
    V0 < 8 ? -128 : (V0 - 8) * 2,
    V0 < 8 ? -128 : (V0 - 8) * 2 + 1,
    V1 < 8 ? -128 : (V1 - 8) * 2,
    V1 < 8 ? -128 : (V1 - 8) * 2 + 1,
    V2 < 8 ? -128 : (V2 - 8) * 2,
    V2 < 8 ? -128 : (V2 - 8) * 2 + 1,
    V3 < 8 ? -128 : (V3 - 8) * 2,
    V3 < 8 ? -128 : (V3 - 8) * 2 + 1,
    V4 < 8 ? -128 : (V4 - 8) * 2,
    V4 < 8 ? -128 : (V4 - 8) * 2 + 1,
    V5 < 8 ? -128 : (V5 - 8) * 2,
    V5 < 8 ? -128 : (V5 - 8) * 2 + 1,
    V6 < 8 ? -128 : (V6 - 8) * 2,
    V6 < 8 ? -128 : (V6 - 8) * 2 + 1,
    V7 < 8 ? -128 : (V7 - 8) * 2,
    V7 < 8 ? -128 : (V7 - 8) * 2 + 1
  };
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
#endif
}
template <int V0, int V1, int V2, int V3>
NLIB_M(i128) I128::Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return vreinterpretq_s8_u32(__builtin_shufflevector(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b),
      V0, V1, V2, V3));
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v4si)a, (__v4si)b,
                                 V0, V1, V2, V3);
#else
  NLIB_ALIGNAS(16) static const int8_t mask_a[16] = {
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4,
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 1,
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 2,
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 3,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 1,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 2,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 3,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 1,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 2,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 3,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 1,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 2,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 3
  };
  NLIB_ALIGNAS(16) static const int8_t mask_b[16] = {
    V0 < 4 ? -128 : (V0 - 4) * 4,
    V0 < 4 ? -128 : (V0 - 4) * 4 + 1,
    V0 < 4 ? -128 : (V0 - 4) * 4 + 2,
    V0 < 4 ? -128 : (V0 - 4) * 4 + 3,
    V1 < 4 ? -128 : (V1 - 4) * 4,
    V1 < 4 ? -128 : (V1 - 4) * 4 + 1,
    V1 < 4 ? -128 : (V1 - 4) * 4 + 2,
    V1 < 4 ? -128 : (V1 - 4) * 4 + 3,
    V2 < 4 ? -128 : (V2 - 4) * 4,
    V2 < 4 ? -128 : (V2 - 4) * 4 + 1,
    V2 < 4 ? -128 : (V2 - 4) * 4 + 2,
    V2 < 4 ? -128 : (V2 - 4) * 4 + 3,
    V3 < 4 ? -128 : (V3 - 4) * 4,
    V3 < 4 ? -128 : (V3 - 4) * 4 + 1,
    V3 < 4 ? -128 : (V3 - 4) * 4 + 2,
    V3 < 4 ? -128 : (V3 - 4) * 4 + 3
  };
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
#endif
}
// (reverses the two bytes within each 16-bit lane; cf. vrev16q on NEON)
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const int8_t mask_[16] = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev16q, u8, value);
#endif
}

// (reverses the four bytes within each 32-bit lane; cf. vrev32q on NEON)
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const int8_t mask_[16] = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev32q, u8, value);
#endif
}

// (reverses the eight bytes within each 64-bit lane; cf. vrev64q on NEON)
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const int8_t mask_[16] = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev64q, u8, value);
#endif
}
// MoveMask8: packs the sign bit of each 8-bit lane into bits 0-15 of the
// result. NEON has no movemask, so each byte is masked with its lane's
// power of two and the lanes are summed horizontally.
NLIB_M(int) I128::MoveMask8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_movemask_epi8(value);
#elif defined(NLIB_NEON)
  uint8x16_t powers = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL));
  uint8x16_t a = vandq_u8(value, powers);
#ifdef __aarch64__
  return vaddv_u8(vget_low_u8(a)) | (vaddv_u8(vget_high_u8(a)) << 8);
#else
  uint8x8_t al = vget_low_u8(a);
  uint8x8_t ah = vget_high_u8(a);
  uint8x8_t tmp = vpadd_u8(al, ah);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u16(vreinterpret_u16_u8(tmp), 0);
#endif
#endif
}
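A hedged usage sketch (editorial addition; the helper name is hypothetical): combining a compare with MoveMask8 to count matches, mirroring the nlib_popcnt16 fallback that appears later in this header.

// Editorial sketch: how many of the 16 byte lanes of a and b are equal.
inline int CountEqualBytes(i128arg a, i128arg b) NLIB_NOEXCEPT {
  return nlib_popcnt16(
      static_cast<uint16_t>(I128::MoveMask8(I128::CmpEq8(a, b))));
}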
// MoveMask16: bit i (0-7) of the result is the sign bit of 16-bit lane i.
NLIB_M(int) I128::MoveMask16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_packs_epi16(value, value);
  return _mm_movemask_epi8(tmp) & 255;
#elif defined(NLIB_NEON)
  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
  uint16x8_t a = vandq_u16(vreinterpretq_u16_s8(value), powers);
#ifdef __aarch64__
  return vaddvq_u16(a);
#else
  uint8x8_t tmp = vmovn_u16(a);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u8(tmp, 0);
#endif
#endif
}
// MoveMask32: bit i (0-3) of the result is the sign bit of 32-bit lane i.
NLIB_M(int) I128::MoveMask32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_packs_epi16(value, value);
  tmp = _mm_packs_epi16(tmp, tmp);
  return _mm_movemask_epi8(tmp) & 15;
#elif defined(NLIB_NEON)
  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
  uint32x4_t a = vandq_u32(vreinterpretq_u32_s8(value), powers);
#ifdef __aarch64__
  return vaddvq_u32(a);
#else
  uint16x4_t tmp = vmovn_u32(a);
  tmp = vpadd_u16(tmp, tmp);
  tmp = vpadd_u16(tmp, tmp);
  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
#endif
#endif
}
// (inverse of MoveMask8: expands bit i of a 16-bit mask into a full
// 0xFF/0x00 byte in lane i, via a bit test against per-lane powers of two)
#if defined(NLIB_NEON)
  int8x8_t m = vcreate_s8(0x8040201008040201ULL);
  int8x8_t s0 = vdup_n_s8(mask & 0xFF);
  int8x8_t s1 = vdup_n_s8(mask >> 8);
  return vtstq_s8(vcombine_s8(m, m), vcombine_s8(s0, s1));
#elif defined(NLIB_SSE41)
  i128 m = I128::SetValue(0x8040201008040201ULL, each_uint64);
  i128 s0 = I128::SetValue(mask & 0xFF, each_int8);
  i128 s1 = I128::SetValue(static_cast<int8_t>(mask >> 8), each_int8);
  i128 s = _mm_blend_epi16(s0, s1, 0xF0);
  return I128::Test8(m, s);
#endif
}

// (expands bit i of an 8-bit mask into a full 16-bit lane mask)
#if defined(NLIB_NEON)
  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
  uint16x8_t s = vdupq_n_u16(mask);
  return vreinterpretq_s8_u16(vtstq_u16(powers, s));
#elif defined(NLIB_SSE41)
  i128 m0 = I128::SetValue(0x0008000400020001ULL, each_uint64);
  i128 m1 = I128::SetValue(0x0080004000200010ULL, each_uint64);
  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
  i128 s = I128::SetValue(static_cast<int16_t>(mask), each_int16);
  return I128::Test16(m, s);
#endif
}

// (expands bit i of a 4-bit mask into a full 32-bit lane mask)
#if defined(NLIB_NEON)
  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
  uint32x4_t s = vdupq_n_u32(mask);
  return vreinterpretq_s8_u32(vtstq_u32(powers, s));
#elif defined(NLIB_SSE41)
  i128 m0 = I128::SetValue(0x0000000200000001ULL, each_uint64);
  i128 m1 = I128::SetValue(0x0000000800000004ULL, each_uint64);
  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
  i128 s = I128::SetValue(mask, each_int32);
  return I128::Test32(m, s);
#endif
}
// (true when every bit of value is zero)
#if defined(NLIB_SSE41)
  return _mm_testz_si128(value, value) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(value));
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;  // all four lanes are -1
#else
  int8x8_t tmp = vorr_s8(vget_low_s8(value), vget_high_s8(value));
  return vget_lane_u64(vreinterpret_u64_s8(tmp), 0) == 0;
#endif
#endif
}

// (true when every bit of value is one)
#if defined(NLIB_SSE41)
  return _mm_testc_si128(value, _mm_cmpeq_epi8(value, value)) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(vmvnq_s8(value)));
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
#else
  int8x8_t tmp = vand_s8(vget_low_s8(value), vget_high_s8(value));
  return vget_lane_s64(vreinterpret_s64_s8(tmp), 0) == -1;
#endif
#endif
}
NLIB_M(i128) I128::Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_blendv_epi8(b, a, mask);
#elif defined(NLIB_NEON)
  return NLIB_OP3(vbslq, u32, mask, a, b);
#endif
}
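Select is the branchless per-bit choice, (mask & a) | (~mask & b); on NEON it is exactly vbslq. Combined with the compare family it builds lane-wise min/max-style operations without branches. A hedged sketch (editorial addition; the helper name is hypothetical):

// Editorial sketch: lane-wise minimum of signed 32-bit lanes.
inline i128 MinInt32Lanes(i128arg a, i128arg b) NLIB_NOEXCEPT {
  return I128::Select(I128::CmpLeInt32(a, b), a, b);
}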
NLIB_M(i128) I128::Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(value, shuffle);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vqtbl1q_s8(value, vreinterpretq_u8_s8(shuffle));
#else
  int8x8x2_t x;
  x.val[0] = vget_low_s8(value);
  x.val[1] = vget_high_s8(value);
  int8x8_t lo = vtbl2_s8(x, vget_low_s8(shuffle));
  int8x8_t hi = vtbl2_s8(x, vget_high_s8(shuffle));
  return vcombine_s8(lo, hi);
#endif
#endif
}
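Shuffle8 is the runtime byte shuffle (pshufb on SSE, table lookup on NEON; 32-bit NEON needs the two-register vtbl2 form for a 16-byte table). A hedged sketch of the classic nibble-lookup idiom (editorial addition; the helper name is hypothetical, and each byte of idx is assumed to be in the range 0 to 15):

// Editorial sketch: map each byte 0..15 of idx to its hex digit character.
inline i128 HexDigits(i128arg idx) NLIB_NOEXCEPT {
  NLIB_ALIGNAS(16) static const char tbl[16] = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
  };
  return I128::Shuffle8(I128::LoadA16(tbl), idx);
}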
// (number of 8-bit lanes holding 0xFF, i.e. the popcount of a lane mask)
#if defined(NLIB_NEON)
#ifdef __aarch64__
  int8x16_t tmp = vnegq_s8(value);  // each 0xFF (-1) lane becomes +1
  return vaddvq_s8(tmp);
#else
  int8x16_t tmp = vnegq_s8(value);
  int8x8_t lo = vget_low_s8(tmp);
  int8x8_t hi = vget_high_s8(tmp);
  lo = vadd_s8(lo, hi);
  lo = vpadd_s8(lo, lo);
  lo = vpadd_s8(lo, lo);
  lo = vpadd_s8(lo, lo);
  return vget_lane_s8(lo, 0);
#endif
#else
  return nlib_popcnt16(static_cast<uint16_t>(I128::MoveMask8(value)));
#endif
}

// (number of leading 'false' 8-bit lanes, counted from the highest lane)
  return nlib_clz(static_cast<uint32_t>(I128::MoveMask8(value))) - 16;
}

// (index of the first 'true' 8-bit lane from the bottom; 16 when none is set)
  return nlib_ctz(static_cast<uint32_t>(I128::MoveMask8(value) | 0x10000));
}
#undef vreinterpretq_s8_s8
#endif  // NLIB_DOXYGEN

#if defined(NLIB_SSE41)
// Transposes, in place, the 4x4 matrix of 32-bit lanes held in row0..row3.
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    row0 = _mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 1, 2, 0)); \
    row1 = _mm_shuffle_epi32(row1, _MM_SHUFFLE(3, 1, 2, 0)); \
    row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(3, 1, 2, 0)); \
    row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(3, 1, 2, 0)); \
    __m128i t0_transpose32_ = _mm_unpacklo_epi32(row0, row1); \
    __m128i t1_transpose32_ = _mm_unpackhi_epi32(row0, row1); \
    __m128i t2_transpose32_ = _mm_unpacklo_epi32(row2, row3); \
    __m128i t3_transpose32_ = _mm_unpackhi_epi32(row2, row3); \
    row0 = _mm_unpacklo_epi64(t0_transpose32_, t2_transpose32_); \
    row1 = _mm_unpacklo_epi64(t1_transpose32_, t3_transpose32_); \
    row2 = _mm_unpackhi_epi64(t0_transpose32_, t2_transpose32_); \
    row3 = _mm_unpackhi_epi64(t1_transpose32_, t3_transpose32_); \
  }
#elif defined(NLIB_NEON)
#ifdef __aarch64__
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
                                     vreinterpretq_u32_s8(row1)); \
    uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
                                     vreinterpretq_u32_s8(row3)); \
    uint64x2_t row0_, row1_, row2_, row3_; \
    row0_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
                       vreinterpretq_u64_u32(trn_f1_.val[0])); \
    row0 = vreinterpretq_s8_u64(row0_); \
    row1_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
                       vreinterpretq_u64_u32(trn_f1_.val[1])); \
    row1 = vreinterpretq_s8_u64(row1_); \
    row2_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
                       vreinterpretq_u64_u32(trn_f1_.val[0])); \
    row2 = vreinterpretq_s8_u64(row2_); \
    row3_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
                       vreinterpretq_u64_u32(trn_f1_.val[1])); \
    row3 = vreinterpretq_s8_u64(row3_); \
  }
#else
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
                                     vreinterpretq_u32_s8(row1)); \
    uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
                                     vreinterpretq_u32_s8(row3)); \
    uint32x4_t row0_, row1_, row2_, row3_; \
    uint32x2_t lo, hi; \
    lo = vget_low_u32(trn_f0_.val[0]); hi = vget_low_u32(trn_f1_.val[0]); \
    row0_ = vcombine_u32(lo, hi); \
    row0 = vreinterpretq_s8_u32(row0_); \
    lo = vget_low_u32(trn_f0_.val[1]); hi = vget_low_u32(trn_f1_.val[1]); \
    row1_ = vcombine_u32(lo, hi); \
    row1 = vreinterpretq_s8_u32(row1_); \
    lo = vget_high_u32(trn_f0_.val[0]); hi = vget_high_u32(trn_f1_.val[0]); \
    row2_ = vcombine_u32(lo, hi); \
    row2 = vreinterpretq_s8_u32(row2_); \
    lo = vget_high_u32(trn_f0_.val[1]); hi = vget_high_u32(trn_f1_.val[1]); \
    row3_ = vcombine_u32(lo, hi); \
    row3 = vreinterpretq_s8_u32(row3_); \
  }
#endif
#endif

#endif  // INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

An empty structure; a tag type indicating a 32-bit signed integer.
An empty structure; a tag type indicating a 64-bit signed integer.
constexpr const each_uint8_tag each_uint8
A constant object of type each_uint8_tag; a tag indicating an 8-bit unsigned integer.
An empty structure; a tag type indicating selection of 8-bit lanes.
An empty structure; a tag type indicating selection of 32-bit lanes.
An empty structure; a tag type indicating an 8-bit signed integer.
constexpr const each_uint16_tag each_uint16
A constant object of type each_uint16_tag; a tag indicating a 16-bit unsigned integer.
An empty structure; a tag type indicating a 16-bit unsigned integer.
An empty structure; a tag type indicating a 64-bit unsigned integer.
constexpr const each_int64_tag each_int64
A constant object of type each_int64_tag; a tag indicating a 64-bit signed integer.
nlib_i128_t i128
A typedef of nlib_i128_t.
constexpr const each_uint64_tag each_uint64
A constant object of type each_uint64_tag; a tag indicating a 64-bit unsigned integer.
A class for integer SIMD operations using the 128-bit registers (XMM0-XMM15 on SSE, Q0-Q15 on NEON). ...
An empty structure; a tag type indicating an 8-bit unsigned integer.
An empty structure; a tag type indicating a 16-bit signed integer.
constexpr const each_int16_tag each_int16
A constant object of type each_int16_tag; a tag indicating a 16-bit signed integer.
constexpr const each_uint32_tag each_uint32
A constant object of type each_uint32_tag; a tag indicating a 32-bit unsigned integer.
#define NLIB_NOEXCEPT
Defined as noexcept, or an equivalent, to match the environment.
An empty structure; a tag type indicating selection of 16-bit lanes.
constexpr const each_select16_tag each_select16
A constant object of type each_select16_tag; a tag indicating selection of 16-bit lanes. ...
#define NLIB_CEXPR
Defined as constexpr when available; otherwise expands to nothing.
__m128i nlib_i128_t
The type of a 128-bit integer SIMD register.
constexpr const each_select8_tag each_select8
A constant object of type each_select8_tag; a tag indicating selection of 8-bit lanes. ...
#define NLIB_ALIGNAS(x)
Defined as alignas(x) or an equivalent.
constexpr const each_int8_tag each_int8
A constant object of type each_int8_tag; a tag indicating an 8-bit signed integer.
constexpr const each_select32_tag each_select32
A constant object of type each_select32_tag; a tag indicating selection of 32-bit lanes. ...
An empty structure; a tag type indicating a 32-bit unsigned integer.
constexpr const each_int32_tag each_int32
A constant object of type each_int32_tag; a tag indicating a 32-bit signed integer.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion; uses static_assert when available.