#ifndef INCLUDE_NN_NLIB_SIMD_SIMDINT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

#include "nn/nlib/Config.h"  // assumed: provides the NLIB_* configuration macros

// Platform intrinsics (reconstructed: the original include block is elided).
#if defined(NLIB_SSE41)
#include <smmintrin.h>
#elif defined(NLIB_NEON)
#include <arm_neon.h>
#endif

// __vectorcall exists only on MSVC 2013 (cl 18.00) and later; the guard below
// (reconstructed) stubs it out everywhere else.
#if !defined(_MSC_VER) || _MSC_VER < 1800
#define __vectorcall
#endif

#if defined(NLIB_SIMD)

// Reconstructed: i128 is the native 128-bit integer vector type.
#if defined(NLIB_SSE41)
typedef __m128i i128;
#elif defined(NLIB_NEON)
typedef int8x16_t i128;
#endif

// Pass by const reference where __vectorcall is unavailable; by value (in
// registers) where it is. (The exact condition is reconstructed.)
#if !defined(_MSC_VER) || _MSC_VER < 1800
typedef const i128& i128arg;
#else
typedef const i128 i128arg;
#endif

// Reconstructed class wrapper: the implementations below refer to I128::...
class I128 {
 public:
  static i128 __vectorcall SetFull(i128arg dummy) NLIB_NOEXCEPT;

  static i128 __vectorcall LoadA16(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadA1(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadLoA1(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA8(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA4(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA2(const void* p) NLIB_NOEXCEPT;
  static i128 __vectorcall LoadHiA1(const void* p) NLIB_NOEXCEPT;
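  // The A<n> suffix is the alignment that p must satisfy, in bytes (A16 =
  // 16-byte aligned, A1 = unaligned). LoadLo*/LoadHi* fill only the low or
  // high 8 bytes of the result and zero the other half, as the definitions
  // later in this header show.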
#define NLIB_LOAD_REDIRECT(func) \
  static i128 __vectorcall func(uintptr_t p) NLIB_NOEXCEPT { \
    return func(reinterpret_cast<void*>(p)); \
  } \
  static i128 __vectorcall func(intptr_t p) NLIB_NOEXCEPT { \
    return func(reinterpret_cast<void*>(p)); \
  }
  NLIB_LOAD_REDIRECT(LoadA16)
  NLIB_LOAD_REDIRECT(LoadA8)
  NLIB_LOAD_REDIRECT(LoadA4)
  NLIB_LOAD_REDIRECT(LoadA2)
  NLIB_LOAD_REDIRECT(LoadA1)
  NLIB_LOAD_REDIRECT(LoadLoA8)
  NLIB_LOAD_REDIRECT(LoadLoA4)
  NLIB_LOAD_REDIRECT(LoadLoA2)
  NLIB_LOAD_REDIRECT(LoadLoA1)
  NLIB_LOAD_REDIRECT(LoadHiA8)
  NLIB_LOAD_REDIRECT(LoadHiA4)
  NLIB_LOAD_REDIRECT(LoadHiA2)
  NLIB_LOAD_REDIRECT(LoadHiA1)
#undef NLIB_LOAD_REDIRECT

  static void __vectorcall StoreA16(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA1(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT;
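  // Usage sketch (illustrative only): copy 16 bytes between byte-aligned
  // buffers, letting the integer redirects below absorb raw addresses:
  //   uintptr_t src_addr = ..., dst_addr = ...;
  //   I128::StoreA1(dst_addr, I128::LoadA1(src_addr));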
#define NLIB_STORE_REDIRECT(func) \
  static void __vectorcall func(uintptr_t p, i128arg value) NLIB_NOEXCEPT { \
    func(reinterpret_cast<void*>(p), value); \
  } \
  static void __vectorcall func(intptr_t p, i128arg value) NLIB_NOEXCEPT { \
    func(reinterpret_cast<void*>(p), value); \
  }
  NLIB_STORE_REDIRECT(StoreA16)
  NLIB_STORE_REDIRECT(StoreA8)
  NLIB_STORE_REDIRECT(StoreA4)
  NLIB_STORE_REDIRECT(StoreA2)
  NLIB_STORE_REDIRECT(StoreA1)
  NLIB_STORE_REDIRECT(StoreLoA8)
  NLIB_STORE_REDIRECT(StoreLoA4)
  NLIB_STORE_REDIRECT(StoreLoA2)
  NLIB_STORE_REDIRECT(StoreLoA1)
  NLIB_STORE_REDIRECT(StoreHiA8)
  NLIB_STORE_REDIRECT(StoreHiA4)
  NLIB_STORE_REDIRECT(StoreHiA2)
  NLIB_STORE_REDIRECT(StoreHiA1)
#undef NLIB_STORE_REDIRECT

  // (template<size_t N> reconstructed: the lane index must be a compile-time
  // constant, as the _mm_extract_*/vgetq_lane_* definitions below require.)
  template<size_t N>
  static uint8_t __vectorcall GetUint8FromLane(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static uint16_t __vectorcall GetUint16FromLane(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static uint32_t __vectorcall GetUint32FromLane(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static uint64_t __vectorcall GetUint64FromLane(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT;
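  // Lane access is compile-time indexed (illustrative):
  //   uint16_t x = I128::GetUint16FromLane<3>(v);
  //   i128 w = I128::SetUint16ToLane<3>(v, static_cast<uint16_t>(x + 1));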
  static i128 __vectorcall Add8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Add64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall NegateInt32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsInt32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall And(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Or(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Xor(i128arg a, i128arg b) NLIB_NOEXCEPT;
  // (Not reconstructed: its definition appears with the others below.)
  static i128 __vectorcall Not(i128arg a) NLIB_NOEXCEPT;
  static i128 __vectorcall AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Test32(i128arg a, i128arg b) NLIB_NOEXCEPT;
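  // Test8/16/32 mark each lane all-ones where a and b share at least one set
  // bit (NEON vtstq); the portable form, as the definitions below show, is
  // Not(CmpEqZero(And(a, b))).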
  static i128 __vectorcall CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpLeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero8(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall CmpEqZero64(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT;
  static i128 __vectorcall ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT;
  // Compile-time shift counts (template<size_t N> reconstructed).
  template<size_t N>
  static i128 __vectorcall ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ByteShiftLeft(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ByteShiftRight(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall ByteRotateRight(i128arg value) NLIB_NOEXCEPT;
  template<size_t N>
  static i128 __vectorcall AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT;
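  // ByteShiftLeft/Right<N> move the whole vector by N bytes and shift in
  // zeros, ByteRotateRight<N> rotates it, and AlignR<N>(a, b) extracts 16
  // bytes from the concatenation of its operands, in the spirit of
  // _mm_alignr_epi8 / vextq_s8. (The template parameters are reconstructed.)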
  static i128 __vectorcall NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT;
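  // Zip*Lo/Hi interleave the low/high halves of two vectors (cf. SSE
  // unpacklo/unpackhi, NEON vzipq); Unzip*Lo/Hi de-interleave the even/odd
  // elements (cf. NEON vuzpq).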
  template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
           int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
  static i128 __vectorcall Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT;
  template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
  static i128 __vectorcall Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT;
  template<int V0, int V1, int V2, int V3>
  static i128 __vectorcall Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT;
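  // Permute32<V0, V1, V2, V3>(a, b) selects each output lane from the eight
  // 32-bit source lanes: indices 0-3 pick a's lanes, 4-7 pick b's (Permute8
  // and Permute16 extend the same scheme to 16 and 8 indices). For example,
  // Permute32<0, 4, 1, 5>(a, b) interleaves the low halves of a and b, and
  // the SetUint*ToLane fallbacks below use Permute*<N == k ? ... : k> to
  // splice a broadcast value into lane N.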
  static i128 __vectorcall Reverse16(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Reverse32(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Reverse64(i128arg value) NLIB_NOEXCEPT;

  static int __vectorcall MoveMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall MoveMask16(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall MoveMask32(i128arg value) NLIB_NOEXCEPT;

  static bool __vectorcall IsZero(i128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsFull(i128arg value) NLIB_NOEXCEPT;
  static i128 __vectorcall Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT;
  static i128 __vectorcall Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT;
  static int __vectorcall PopCntMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall ClzMask8(i128arg value) NLIB_NOEXCEPT;
  static int __vectorcall CtzMask8(i128arg value) NLIB_NOEXCEPT;
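  // Usage sketch (illustrative only, assuming the 16 bytes contain a NUL):
  //   i128 v = I128::LoadA1(str);                      // 16 bytes, unaligned
  //   int pos = I128::CtzMask8(I128::CmpEqZero8(v));   // first NUL lane
  // MoveMask8 collapses the byte-lane mask to 16 scalar bits for tests such
  // as MoveMask8(m) == 0.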
};  // class I128 (closing brace reconstructed)

#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

#undef vreinterpret_s8_s8
#define vreinterpretq_s8_s8(a) (a)
#define NLIB_OP1(intrin, tp, a) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a)))
#define NLIB_OP2(intrin, tp, a, b) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                      vreinterpretq_##tp##_s8(b)))
#define NLIB_OP3(intrin, tp, a, b, c) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                      vreinterpretq_##tp##_s8(b), \
                                      vreinterpretq_##tp##_s8(c)))
#define NLIB_CMP(intrin, tp, a, b, utp) \
  vreinterpretq_s8_##utp(intrin##_##tp(vreinterpretq_##tp##_s8(a), \
                                       vreinterpretq_##tp##_s8(b)))
#define NLIB_SFT(intrin, tp, a, cnt, stp) \
  vreinterpretq_s8_##tp(intrin##_##tp(vreinterpretq_##tp##_s8(a), vdupq_n_##stp(cnt)))
#define NLIB_CMB(tp, l, h) vreinterpretq_s8_##tp(vcombine_##tp(l, h))

// The SetValue overload signatures below are reconstructed; the tag types
// (each_int8_tag and friends) are assumed from call sites such as
// I128::SetValue(0x80, each_uint8) later in this header.
NLIB_M(i128) I128::SetValue(int8_t v, each_int8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(static_cast<uint8_t>(v)), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vdupq_n_s8(v);
#endif
}
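// For reference, NLIB_OP2(vaddq, s16, a, b) expands to
//   vreinterpretq_s8_s16(vaddq_s16(vreinterpretq_s16_s8(a),
//                                  vreinterpretq_s16_s8(b)))
// i.e. the helpers keep int8x16_t as the common carrier type and reinterpret
// to the element type only around the intrinsic itself.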
NLIB_M(i128) I128::SetValue(int16_t v, each_int16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(v);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s16(vdupq_n_s16(v));
#endif
}

NLIB_M(i128) I128::SetValue(int32_t v, each_int32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(v);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vdupq_n_s32(v));
#endif
}

NLIB_M(i128) I128::SetValue(int64_t v, each_int64_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if !defined(NLIB_64BIT)
  // Reconstructed 32-bit fallback (alignment macro assumed):
  // _mm_set1_epi64x is unavailable on x86-32.
  NLIB_ALIGNAS(16) int64_t tmp[2] = {v, v};
  return I128::LoadA16(tmp);
#else
  return _mm_set1_epi64x(v);
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s64(vdupq_n_s64(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint8_t v, each_uint8_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(_mm_cvtsi32_si128(v), _mm_setzero_si128());
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u8(vdupq_n_u8(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint16_t v, each_uint16_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi16(static_cast<int16_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u16(vdupq_n_u16(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_set1_epi32(static_cast<int32_t>(v));
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u32(vdupq_n_u32(v));
#endif
}

NLIB_M(i128) I128::SetValue(uint64_t v, each_uint64_tag) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if !defined(NLIB_64BIT)
  NLIB_ALIGNAS(16) uint64_t tmp[2] = {v, v};
  return I128::LoadA16(tmp);
#else
  return _mm_set1_epi64x(static_cast<int64_t>(v));
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vdupq_n_u64(v));
#endif
}
// Reconstructed (the signature is elided in the source; the member name
// SetValue32 is assumed): splat 32-bit lane N across the vector.
template <size_t N>
NLIB_M(i128) I128::SetValue32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi32(value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint32x4_t v = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vdupq_laneq_u32(v, N));
#else
  // The source spells this out as per-lane specializations; condensed here.
  uint32x2_t v = (N < 2) ? vget_low_u32(vreinterpretq_u32_s8(value))
                         : vget_high_u32(vreinterpretq_u32_s8(value));
  return vreinterpretq_s8_u32(vdupq_lane_u32(v, N & 1));
#endif
#endif
}
// Reconstructed as above for 16-bit lanes (member name SetValue16 assumed);
// the source's per-lane NEON specializations for N = 0..7 are condensed.
template <size_t N>
NLIB_M(i128) I128::SetValue16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) const int8_t mask[16] = {
    2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1,
    2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1, 2 * N, 2 * N + 1
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(mask));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint16x8_t v = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vdupq_laneq_u16(v, N));
#else
  uint16x4_t v = (N < 4) ? vget_low_u16(vreinterpretq_u16_s8(value))
                         : vget_high_u16(vreinterpretq_u16_s8(value));
  return vreinterpretq_s8_u16(vdupq_lane_u16(v, N & 3));
#endif
#endif
}
#if defined(NLIB_SSE41)
// Reconstructed (member name SetValue8 assumed): splat 8-bit lane N.
template <size_t N>
NLIB_M(i128) I128::SetValue8(i128arg value) NLIB_NOEXCEPT {
  NLIB_ALIGNAS(16) const int8_t mask[16] = {
    N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask[0]));
}
#elif defined(NLIB_NEON)
namespace detail {
template <size_t N, bool IsLower>
struct SetValue8Helper {
  int8x16_t operator()(int8x16_t value) const {
    return vdupq_lane_s8(vget_low_s8(value), N);
  }
};
template <size_t N>
struct SetValue8Helper<N, false> {
  int8x16_t operator()(int8x16_t value) const {
    return vdupq_lane_s8(vget_high_s8(value), N - 8);
  }
};
}  // namespace detail

template <size_t N>
NLIB_M(i128) I128::SetValue8(i128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vdupq_laneq_s8(value, N);
#else
  return detail::SetValue8Helper<N, (N < 8)>()(value);
#endif
}
#endif
NLIB_M(i128) I128::SetZero() NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_setzero_si128();
#elif defined(NLIB_NEON)
  return vdupq_n_s8(0);
#endif
}

// All bits set: any value compared equal to itself yields an all-ones mask.
NLIB_M(i128) I128::SetFull(i128arg dummy) NLIB_NOEXCEPT { return I128::CmpEq8(dummy, dummy); }
NLIB_M(i128) I128::LoadA16(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
  return vreinterpretq_s8_u64(tmp);
#endif
}

NLIB_M(i128) I128::LoadA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vld1q_u64(reinterpret_cast<const uint64_t*>(p));
  return vreinterpretq_s8_u64(tmp);
#endif
}

NLIB_M(i128) I128::LoadA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vld1q_u32(reinterpret_cast<const uint32_t*>(p));
  return vreinterpretq_s8_u32(tmp);
#endif
}

NLIB_M(i128) I128::LoadA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  uint16x8_t tmp = vld1q_u16(reinterpret_cast<const uint16_t*>(p));
  return vreinterpretq_s8_u16(tmp);
#endif
}

NLIB_M(i128) I128::LoadA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  return vld1q_s8(reinterpret_cast<const int8_t*>(p));
#endif
}
NLIB_M(i128) I128::LoadLoA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u64(vld1_u64(reinterpret_cast<const uint64_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

NLIB_M(i128) I128::LoadLoA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u32(vld1_u32(reinterpret_cast<const uint32_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

NLIB_M(i128) I128::LoadLoA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vreinterpret_s8_u16(vld1_u16(reinterpret_cast<const uint16_t*>(p)));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}

NLIB_M(i128) I128::LoadLoA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
#elif defined(NLIB_NEON)
  int8x8_t lo = vld1_s8(reinterpret_cast<const int8_t*>(p));
  return vcombine_s8(lo, vdup_n_s8(0));
#endif
}
NLIB_M(i128) I128::LoadHiA8(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u64(vld1_u64(reinterpret_cast<const uint64_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

NLIB_M(i128) I128::LoadHiA4(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u32(vld1_u32(reinterpret_cast<const uint32_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

NLIB_M(i128) I128::LoadHiA2(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vreinterpret_s8_u16(vld1_u16(reinterpret_cast<const uint16_t*>(p)));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}

NLIB_M(i128) I128::LoadHiA1(const void* p) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
  return _mm_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2));
#elif defined(NLIB_NEON)
  int8x8_t hi = vld1_s8(reinterpret_cast<const int8_t*>(p));
  return vcombine_s8(vdup_n_s8(0), hi);
#endif
}
NLIB_M(void) I128::StoreA16(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_store_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

NLIB_M(void) I128::StoreA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u64(reinterpret_cast<uint64_t*>(p), vreinterpretq_u64_s8(value));
#endif
}

NLIB_M(void) I128::StoreA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u32(reinterpret_cast<uint32_t*>(p), vreinterpretq_u32_s8(value));
#endif
}

NLIB_M(void) I128::StoreA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_u16(reinterpret_cast<uint16_t*>(p), vreinterpretq_u16_s8(value));
#endif
}

NLIB_M(void) I128::StoreA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storeu_si128(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  vst1q_s8(reinterpret_cast<int8_t*>(p), value);
#endif
}
NLIB_M(void) I128::StoreLoA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint64x1_t x = vreinterpret_u64_s8(vget_low_s8(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint32x2_t x = vreinterpret_u32_s8(vget_low_s8(value));
  vst1_u32(reinterpret_cast<uint32_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  uint16x4_t x = vreinterpret_u16_s8(vget_low_s8(value));
  vst1_u16(reinterpret_cast<uint16_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreLoA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p), value);
#elif defined(NLIB_NEON)
  int8x8_t x = vget_low_s8(value);
  vst1_s8(reinterpret_cast<int8_t*>(p), x);
#endif
}
NLIB_M(void) I128::StoreHiA8(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint64x1_t x = vreinterpret_u64_s8(vget_high_s8(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA4(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint32x2_t x = vreinterpret_u32_s8(vget_high_s8(value));
  vst1_u32(reinterpret_cast<uint32_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA2(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint16x4_t x = vreinterpret_u16_s8(vget_high_s8(value));
  vst1_u16(reinterpret_cast<uint16_t*>(p), x);
#endif
}

NLIB_M(void) I128::StoreHiA1(void* p, i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  _mm_storel_epi64(reinterpret_cast<i128*>(p),
                   _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int8x8_t x = vget_high_s8(value);
  vst1_s8(reinterpret_cast<int8_t*>(p), x);
#endif
}
template<size_t N>
NLIB_M(uint8_t) I128::GetUint8FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint8_t>(_mm_extract_epi8(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u8(vreinterpretq_u8_s8(value), N);
#endif
}

template<size_t N>
NLIB_M(uint16_t) I128::GetUint16FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint16_t>(_mm_extract_epi16(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u16(vreinterpretq_u16_s8(value), N);
#endif
}

template<size_t N>
NLIB_M(uint32_t) I128::GetUint32FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return static_cast<uint32_t>(_mm_extract_epi32(value, N));
#elif defined(NLIB_NEON)
  return vgetq_lane_u32(vreinterpretq_u32_s8(value), N);
#endif
}

template<size_t N>
NLIB_M(uint64_t) I128::GetUint64FromLane(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if defined(NLIB_64BIT)
  return static_cast<uint64_t>(_mm_extract_epi64(value, N));
#endif
  // (On 32-bit x86 the specializations below are used instead.)
#elif defined(NLIB_NEON)
  return vgetq_lane_u64(vreinterpretq_u64_s8(value), N);
#endif
}
#if defined(NLIB_SSE41) && !defined(NLIB_64BIT)
template<>
NLIB_M(uint64_t) I128::GetUint64FromLane<0>(i128arg value) NLIB_NOEXCEPT {
  uint64_t rval;
  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), value);
  return rval;
}
template<>
NLIB_M(uint64_t) I128::GetUint64FromLane<1>(i128arg value) NLIB_NOEXCEPT {
  uint64_t rval;
  i128 tmp = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  _mm_storel_epi64(reinterpret_cast<i128*>(&rval), tmp);
  return rval;
}
#endif
template<size_t N>
NLIB_M(i128) I128::SetUint8ToLane(i128arg value, uint8_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi8(value, static_cast<int8_t>(v), N);
#elif defined(NLIB_NEON)
  // For a compile-time constant v, broadcast it and splice with Permute8
  // (indices 16-31 select the broadcast); otherwise use vsetq_lane.
  return __builtin_constant_p(v) ?
      I128::Permute8<N == 0 ? 16 : 0, N == 1 ? 17 : 1, N == 2 ? 18 : 2,
                     N == 3 ? 19 : 3, N == 4 ? 20 : 4, N == 5 ? 21 : 5,
                     N == 6 ? 22 : 6, N == 7 ? 23 : 7, N == 8 ? 24 : 8,
                     N == 9 ? 25 : 9, N == 10 ? 26 : 10, N == 11 ? 27 : 11,
                     N == 12 ? 28 : 12, N == 13 ? 29 : 13, N == 14 ? 30 : 14,
                     N == 15 ? 31 : 15>(value, vreinterpretq_s8_u8(vdupq_n_u8(v))) :
      vreinterpretq_s8_u8(vsetq_lane_u8(v, vreinterpretq_u8_s8(value), N));
#endif
}

template<size_t N>
NLIB_M(i128) I128::SetUint16ToLane(i128arg value, uint16_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi16(value, static_cast<int16_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v) ?
      I128::Permute16<N == 0 ? 8 : 0, N == 1 ? 9 : 1, N == 2 ? 10 : 2,
                      N == 3 ? 11 : 3, N == 4 ? 12 : 4, N == 5 ? 13 : 5,
                      N == 6 ? 14 : 6,
                      N == 7 ? 15 : 7>(value, vreinterpretq_s8_u16(vdupq_n_u16(v))) :
      vreinterpretq_s8_u16(vsetq_lane_u16(v, vreinterpretq_u16_s8(value), N));
#endif
}

template<size_t N>
NLIB_M(i128) I128::SetUint32ToLane(i128arg value, uint32_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_insert_epi32(value, static_cast<int32_t>(v), N);
#elif defined(NLIB_NEON)
  return __builtin_constant_p(v) ?
      I128::Permute32<N == 0 ? 4 : 0, N == 1 ? 5 : 1, N == 2 ? 6 : 2,
                      N == 3 ? 7 : 3>(value, vreinterpretq_s8_u32(vdupq_n_u32(v))) :
      vreinterpretq_s8_u32(vsetq_lane_u32(v, vreinterpretq_u32_s8(value), N));
#endif
}

template<size_t N>
NLIB_M(i128) I128::SetUint64ToLane(i128arg value, uint64_t v) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
#if defined(NLIB_64BIT)
  return _mm_insert_epi64(value, static_cast<int64_t>(v), N);
#else
  // Reconstructed 32-bit path: insert the value as two 32-bit halves.
  union {
    int64_t i64;
    int32_t i32[2];
  } tmp;
  tmp.i64 = static_cast<int64_t>(v);
  i128 rval = _mm_insert_epi32(value, tmp.i32[0], N * 2 + 0);
  return _mm_insert_epi32(rval, tmp.i32[1], N * 2 + 1);
#endif
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_u64(vsetq_lane_u64(v, vreinterpretq_u64_s8(value), N));
#endif
}
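// Illustrative sketch (not part of the original header; the helper name is
// hypothetical): replace lane 2 of a vector of uint32 and read it back.
inline uint32_t nlib_example_replace_lane2(i128arg v) NLIB_NOEXCEPT {
  i128 t = I128::SetUint32ToLane<2>(v, 0xDEADBEEFU);
  return I128::GetUint32FromLane<2>(t);  // yields 0xDEADBEEFU
}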
NLIB_M(i128) I128::Add8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi8(a, b);
#elif defined(NLIB_NEON)
  return vaddq_s8(a, b);
#endif
}

NLIB_M(i128) I128::Add16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s16, a, b);
#endif
}

NLIB_M(i128) I128::Add32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s32, a, b);
#endif
}

NLIB_M(i128) I128::Add64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vaddq, s64, a, b);
#endif
}

NLIB_M(i128) I128::AddInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi8(a, b);
#elif defined(NLIB_NEON)
  return vqaddq_s8(a, b);
#endif
}

NLIB_M(i128) I128::AddInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, s16, a, b);
#endif
}

NLIB_M(i128) I128::AddUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, u8, a, b);
#endif
}

NLIB_M(i128) I128::AddUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_adds_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqaddq, u16, a, b);
#endif
}
NLIB_M(i128) I128::Sub8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(a, b);
#elif defined(NLIB_NEON)
  return vsubq_s8(a, b);
#endif
}

NLIB_M(i128) I128::Sub16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s16, a, b);
#endif
}

NLIB_M(i128) I128::Sub32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s32, a, b);
#endif
}

NLIB_M(i128) I128::Sub64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi64(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vsubq, s64, a, b);
#endif
}

NLIB_M(i128) I128::SubInt8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, s8, a, b);
#endif
}

NLIB_M(i128) I128::SubInt16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, s16, a, b);
#endif
}

NLIB_M(i128) I128::SubUint8Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, u8, a, b);
#endif
}

NLIB_M(i128) I128::SubUint16Saturated(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_subs_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vqsubq, u16, a, b);
#endif
}
NLIB_M(i128) I128::PairwiseAdd8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i ax = _mm_add_epi8(a, _mm_srli_epi16(a, 8));
  __m128i bx = _mm_add_epi8(b, _mm_srli_epi16(b, 8));
  return I128::NarrowFrom16To8(ax, bx);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpaddq_s8(a, b);
#else
  int8x8_t al = vget_low_s8(a);
  int8x8_t ah = vget_high_s8(a);
  int8x8_t rl = vpadd_s8(al, ah);
  int8x8_t bl = vget_low_s8(b);
  int8x8_t bh = vget_high_s8(b);
  int8x8_t rh = vpadd_s8(bl, bh);
  return vcombine_s8(rl, rh);
#endif
#endif
}

NLIB_M(i128) I128::PairwiseAdd16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi16(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vreinterpretq_s8_s16(vpaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b)));
#else
  int16x4_t al = vget_low_s16(vreinterpretq_s16_s8(a));
  int16x4_t ah = vget_high_s16(vreinterpretq_s16_s8(a));
  int16x4_t rl = vpadd_s16(al, ah);
  int16x4_t bl = vget_low_s16(vreinterpretq_s16_s8(b));
  int16x4_t bh = vget_high_s16(vreinterpretq_s16_s8(b));
  int16x4_t rh = vpadd_s16(bl, bh);
  return NLIB_CMB(s16, rl, rh);
#endif
#endif
}

NLIB_M(i128) I128::PairwiseAdd32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_hadd_epi32(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vreinterpretq_s8_s32(vpaddq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b)));
#else
  int32x2_t al = vget_low_s32(vreinterpretq_s32_s8(a));
  int32x2_t ah = vget_high_s32(vreinterpretq_s32_s8(a));
  int32x2_t rl = vpadd_s32(al, ah);
  int32x2_t bl = vget_low_s32(vreinterpretq_s32_s8(b));
  int32x2_t bh = vget_high_s32(vreinterpretq_s32_s8(b));
  int32x2_t rh = vpadd_s32(bl, bh);
  return NLIB_CMB(s32, rl, rh);
#endif
#endif
}
NLIB_M(i128) I128::Mult16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmulq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MultAdd16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlaq, s16, c, a, b);
#endif
}

NLIB_M(i128) I128::MultSub16(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(c, _mm_mullo_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlsq, s16, c, a, b);
#endif
}

NLIB_M(i128) I128::Mult32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_mullo_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmulq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MultAdd32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_add_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlaq, s32, c, a, b);
#endif
}

NLIB_M(i128) I128::MultSub32(i128arg a, i128arg b, i128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(c, _mm_mullo_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP3(vmlsq, s32, c, a, b);
#endif
}
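// Note the operand order: MultAdd16/32(a, b, c) computes c + a * b lane-wise
// (SSE: add(c, mullo(a, b)); NEON: vmlaq(c, a, b)), and MultSub16/32 computes
// c - a * b, not (a * b) - c.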
NLIB_M(i128) I128::MaxInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s8, a, b);
#endif
}

NLIB_M(i128) I128::MaxInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MaxInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u8, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u16, a, b);
#endif
}

NLIB_M(i128) I128::MaxUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_max_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vmaxq, u32, a, b);
#endif
}

NLIB_M(i128) I128::MinInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s8, a, b);
#endif
}

NLIB_M(i128) I128::MinInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s16, a, b);
#endif
}

NLIB_M(i128) I128::MinInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, s32, a, b);
#endif
}

NLIB_M(i128) I128::MinUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u8, a, b);
#endif
}

NLIB_M(i128) I128::MinUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u16, a, b);
#endif
}

NLIB_M(i128) I128::MinUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_min_epu32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vminq, u32, a, b);
#endif
}
NLIB_M(i128) I128::AbsInt8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi8(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s8, value);
#endif
}

NLIB_M(i128) I128::AbsInt16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi16(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s16, value);
#endif
}

NLIB_M(i128) I128::AbsInt32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi32(value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vabsq, s32, value);
#endif
}

NLIB_M(i128) I128::AbsDiffInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi8(_mm_sub_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s8, a, b);
#endif
}

NLIB_M(i128) I128::AbsDiffInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi16(_mm_sub_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s16, a, b);
#endif
}

NLIB_M(i128) I128::AbsDiffInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_abs_epi32(_mm_sub_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_OP2(vabdq, s32, a, b);
#endif
}
NLIB_M(i128) I128::NegateInt8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi8(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s8, value);
#endif
}

NLIB_M(i128) I128::NegateInt16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi16(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s16, value);
#endif
}

NLIB_M(i128) I128::NegateInt32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_sub_epi32(_mm_setzero_si128(), value);
#elif defined(NLIB_NEON)
  return NLIB_OP1(vnegq, s32, value);
#endif
}
NLIB_M(i128) I128::And(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_and_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vandq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Or(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vorrq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Xor(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_xor_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(veorq, s8, a, b);
#endif
}

NLIB_M(i128) I128::Not(i128arg a) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  // ~a, via andnot against an all-ones mask.
  return _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vmvnq, s8, a);
#endif
}

// AndNot(a, b) computes ~a & b (note that vbicq takes its operands as (b, a)).
NLIB_M(i128) I128::AndNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_andnot_si128(a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vbicq, s8, b, a);
#endif
}

// OrNot(a, b) computes ~a | b.
NLIB_M(i128) I128::OrNot(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i not_a = _mm_andnot_si128(a, _mm_cmpeq_epi8(a, a));
  return _mm_or_si128(not_a, b);
#elif defined(NLIB_NEON)
  return NLIB_OP2(vornq, s8, b, a);
#endif
}
NLIB_M(i128) I128::Test8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  // vtstq yields unsigned lane masks; NLIB_CMP reinterprets them back to i128.
  return NLIB_CMP(vtstq, s8, a, b, u8);
#else
  return I128::Not(I128::CmpEqZero8(I128::And(a, b)));
#endif
}

NLIB_M(i128) I128::Test16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return NLIB_CMP(vtstq, s16, a, b, u16);
#else
  return I128::Not(I128::CmpEqZero16(I128::And(a, b)));
#endif
}

NLIB_M(i128) I128::Test32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return NLIB_CMP(vtstq, s32, a, b, u32);
#else
  return I128::Not(I128::CmpEqZero32(I128::And(a, b)));
#endif
}
NLIB_M(i128) I128::CmpEq8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpEq16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpEq32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vceqq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpEq64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi64(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_CMP(vceqq, s64, a, b, u64);
#else
  // Compare 32-bit halves, then AND the two halves of each 64-bit lane.
  uint32x4_t x0 = vceqq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b));
  uint32x2x2_t x1 = vtrn_u32(vget_low_u32(x0), vget_high_u32(x0));
  uint32x2_t x2 = vand_u32(x1.val[0], x1.val[1]);
  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
  return vreinterpretq_s8_s64(result);
#endif
#endif
}
NLIB_M(i128) I128::CmpLtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmplt_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpLtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  return _mm_cmpgt_epi64(b, a);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_CMP(vcltq, s64, a, b, u64);
#else
  // Signed compare of the high words, unsigned compare of the low words.
  int32x2x2_t trn_a = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(a)),
                               vreinterpret_s32_s8(vget_high_s8(a)));
  int32x2x2_t trn_b = vtrn_s32(vreinterpret_s32_s8(vget_low_s8(b)),
                               vreinterpret_s32_s8(vget_high_s8(b)));
  uint32x2_t upper_lt = vclt_s32(trn_a.val[1], trn_b.val[1]);
  uint32x2_t upper_eq = vceq_s32(trn_a.val[1], trn_b.val[1]);
  uint32x2_t lower_lt = vclt_u32(vreinterpret_u32_s32(trn_a.val[0]),
                                 vreinterpret_u32_s32(trn_b.val[0]));
  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
  return vreinterpretq_s8_s64(result);
#endif
#else
  // SSE4.1 fallback without _mm_cmpgt_epi64.
  i128 cmp = I128::CmpLtInt32(a, b);
  i128 eq = I128::CmpEq32(a, b);
  i128 cmp_lt = I128::CmpLtUint32(a, b);
  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp_lt, cmp_lt);
  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
#endif
}
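// The 64-bit fallbacks split each lane into 32-bit words:
//   a < b  <=>  hi(a) < hi(b) || (hi(a) == hi(b) && lo(a) <u lo(b))
// where the high words compare in the operands' signedness and the low words
// always compare unsigned. Scalar reference (illustrative only):
inline bool nlib_example_lt_int64_via_words(int64_t a, int64_t b) NLIB_NOEXCEPT {
  int32_t ah = static_cast<int32_t>(a >> 32);
  int32_t bh = static_cast<int32_t>(b >> 32);
  uint32_t al = static_cast<uint32_t>(a);
  uint32_t bl = static_cast<uint32_t>(b);
  return ah < bh || (ah == bh && al < bl);
}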
NLIB_M(i128) I128::CmpGtInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi8(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGtInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi16(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGtInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpgt_epi32(a, b);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGtInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  return _mm_cmpgt_epi64(a, b);
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgtq, s64, a, b, u64);
#else
  return I128::CmpLtInt64(b, a);
#endif
}
// SSE integer compares are signed only, so the unsigned forms below bias both
// operands by the sign bit before comparing.
NLIB_M(i128) I128::CmpLtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80, each_uint8);
  return _mm_cmplt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGtUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80, each_uint8);
  return _mm_cmpgt_epi8(_mm_add_epi8(a, ofs), _mm_add_epi8(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x8000U, each_uint16);
  return _mm_cmplt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGtUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x8000U, each_uint16);
  return _mm_cmpgt_epi16(_mm_add_epi16(a, ofs), _mm_add_epi16(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
  return _mm_cmplt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcltq, u32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGtUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 ofs = I128::SetValue(0x80000000U, each_uint32);
  return _mm_cmpgt_epi32(_mm_add_epi32(a, ofs), _mm_add_epi32(b, ofs));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgtq, u32, a, b, u32);
#endif
}
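// The bias maps unsigned order onto signed order: adding 0x80 (equivalently,
// XORing it, mod 256) flips the top bit, so e.g. 0x01 <u 0xFF becomes
// -127 <s 127 after biasing. Scalar reference (illustrative only):
inline bool nlib_example_ltu8_via_bias(uint8_t a, uint8_t b) NLIB_NOEXCEPT {
  return static_cast<int8_t>(a ^ 0x80) < static_cast<int8_t>(b ^ 0x80);
}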
NLIB_M(i128) I128::CmpLtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  return _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return NLIB_CMP(vcltq, u64, a, b, u64);
#else
  uint32x2x2_t trn_a = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(a)),
                                vreinterpret_u32_s8(vget_high_s8(a)));
  uint32x2x2_t trn_b = vtrn_u32(vreinterpret_u32_s8(vget_low_s8(b)),
                                vreinterpret_u32_s8(vget_high_s8(b)));
  uint32x2_t upper_lt = vclt_u32(trn_a.val[1], trn_b.val[1]);
  uint32x2_t upper_eq = vceq_u32(trn_a.val[1], trn_b.val[1]);
  uint32x2_t lower_lt = vclt_u32(trn_a.val[0], trn_b.val[0]);
  uint32x2_t x2 = vorr_u32(upper_lt, vand_u32(upper_eq, lower_lt));
  int64x2_t result = vmovl_s32(vreinterpret_s32_u32(x2));
  return vreinterpretq_s8_s64(result);
#endif
#else
  i128 cmp = I128::CmpLtUint32(a, b);
  i128 eq = I128::CmpEq32(a, b);
  i128 upper_lt = I128::Permute32<1, 1, 3, 3>(cmp, cmp);
  i128 lower_lt = I128::Permute32<0, 0, 2, 2>(cmp, cmp);
  i128 upper_eq = I128::Permute32<1, 1, 3, 3>(eq, eq);
  return I128::Or(upper_lt, I128::And(upper_eq, lower_lt));
#endif
}

NLIB_M(i128) I128::CmpGtUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  return _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgtq, u64, a, b, u64);
#else
  return I128::CmpLtUint64(b, a);
#endif
}
NLIB_M(i128) I128::CmpLeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi8(a, b), _mm_cmpeq_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpLeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpLeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmplt_epi32(a, b), _mm_cmpeq_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcleq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpLeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  return _mm_or_si128(_mm_cmpgt_epi64(b, a), _mm_cmpeq_epi64(a, b));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcleq, s64, a, b, u64);
#else
  return I128::Not(I128::CmpGtInt64(a, b));
#endif
}

NLIB_M(i128) I128::CmpGeInt8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi8(a, b), _mm_cmpeq_epi8(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, s8, a, b, u8);
#endif
}

NLIB_M(i128) I128::CmpGeInt16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, s16, a, b, u16);
#endif
}

NLIB_M(i128) I128::CmpGeInt32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_or_si128(_mm_cmpgt_epi32(a, b), _mm_cmpeq_epi32(a, b));
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, s32, a, b, u32);
#endif
}

NLIB_M(i128) I128::CmpGeInt64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  return _mm_or_si128(_mm_cmpgt_epi64(a, b), _mm_cmpeq_epi64(a, b));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgeq, s64, a, b, u64);
#else
  return I128::Not(I128::CmpLtInt64(a, b));
#endif
}
2002 NLIB_M(i128) I128::CmpLeUint8(i128arg a, i128arg b)
NLIB_NOEXCEPT {
2003 #if defined(NLIB_SSE41) 2004 return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
2005 #elif defined(NLIB_NEON) 2006 return NLIB_CMP(vcleq, u8, a, b, u8);
2011 NLIB_M(i128) I128::CmpLeUint16(i128arg a, i128arg b)
NLIB_NOEXCEPT {
2012 #if defined(NLIB_SSE41) 2013 return _mm_cmpeq_epi16(_mm_min_epu16(a, b), a);
2014 #elif defined(NLIB_NEON) 2015 return NLIB_CMP(vcleq, u16, a, b, u16);
2020 NLIB_M(i128) I128::CmpLeUint32(i128arg a, i128arg b)
NLIB_NOEXCEPT {
2021 #if defined(NLIB_SSE41) 2022 return _mm_cmpeq_epi32(_mm_min_epu32(a, b), a);
2023 #elif defined(NLIB_NEON) 2024 return NLIB_CMP(vcleq, u32, a, b, u32);
2029 NLIB_M(i128) I128::CmpLeUint64(i128arg a, i128arg b)
NLIB_NOEXCEPT {
2030 #if defined(NLIB_SSE42) 2031 i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
2032 i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(b, ofs), _mm_add_epi64(a, ofs));
2033 return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
2034 #elif defined(NLIB_NEON) && defined(__aarch64__) 2035 return NLIB_CMP(vcleq, u64, a, b, u64);
2037 return I128::Not(I128::CmpGtUint64(a, b));
// a >= b for each unsigned 8-bit lane (SSE: a >= b iff max(a, b) == a)
NLIB_M(i128) I128::CmpGeUint8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u8, a, b, u8);
#endif
}

// a >= b for each unsigned 16-bit lane
NLIB_M(i128) I128::CmpGeUint16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi16(_mm_max_epu16(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u16, a, b, u16);
#endif
}

// a >= b for each unsigned 32-bit lane
NLIB_M(i128) I128::CmpGeUint32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cmpeq_epi32(_mm_max_epu32(a, b), a);
#elif defined(NLIB_NEON)
  return NLIB_CMP(vcgeq, u32, a, b, u32);
#endif
}

// a >= b for each unsigned 64-bit lane (SSE: same 2^63 bias trick as above)
NLIB_M(i128) I128::CmpGeUint64(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE42)
  i128 ofs = I128::SetValue(0x8000000000000000ULL, each_uint64);
  i128 mask = _mm_cmpgt_epi64(_mm_add_epi64(a, ofs), _mm_add_epi64(b, ofs));
  return _mm_or_si128(mask, _mm_cmpeq_epi64(a, b));
#elif defined(NLIB_NEON) && defined(__aarch64__)
  return NLIB_CMP(vcgeq, u64, a, b, u64);
#else
  return I128::Not(I128::CmpLtUint64(a, b));
#endif
}
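
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header). Every Cmp*
// function yields all-ones in a lane where the relation holds and zero
// elsewhere, so the result can drive I128::Select (defined later in this
// header) directly. The aligned buffers are assumptions of the example:
//
//   i128 a  = I128::LoadA16(src_a);       // src_a, src_b: 16-byte aligned
//   i128 b  = I128::LoadA16(src_b);
//   i128 le = I128::CmpLeInt32(a, b);     // 0xFFFFFFFF where a <= b
//   i128 mn = I128::Select(le, a, b);     // per-lane signed minimum
//   I128::StoreA16(dst, mn);
// ---------------------------------------------------------------------------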
// value == 0 for each 8-bit lane
NLIB_M(i128) I128::CmpEqZero8(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vceqzq_s8(value);
#else
  return I128::CmpEq8(value, I128::SetZero());
#endif
}

// value == 0 for each 16-bit lane
NLIB_M(i128) I128::CmpEqZero16(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vreinterpretq_s8_s16(vceqzq_s16(vreinterpretq_s16_s8(value)));
#else
  return I128::CmpEq16(value, I128::SetZero());
#endif
}

// value == 0 for each 32-bit lane
NLIB_M(i128) I128::CmpEqZero32(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vreinterpretq_s8_s32(vceqzq_s32(vreinterpretq_s32_s8(value)));
#else
  return I128::CmpEq32(value, I128::SetZero());
#endif
}

// value == 0 for each 64-bit lane
NLIB_M(i128) I128::CmpEqZero64(i128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__)
  return vreinterpretq_s8_s64(vceqzq_s64(vreinterpretq_s64_s8(value)));
#else
  return I128::CmpEq64(value, I128::SetZero());
#endif
}
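
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): comparing
// against zero without materializing a zero vector first. The mask pairs
// naturally with the MoveMask8 helper further below:
//
//   i128 v = I128::LoadA16(p);            // p: 16-byte-aligned buffer
//   i128 z = I128::CmpEqZero8(v);         // 0xFF in each lane whose byte == 0
// ---------------------------------------------------------------------------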
// per-lane logical left shift of the 8-bit lanes by a runtime count
// (SSE has no 8-bit shift, so each half is widened to 16 bits first)
NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i xh = _mm_slli_epi16(_mm_cvtepu8_epi16(hi), count);
  __m128i xl = _mm_slli_epi16(_mm_cvtepu8_epi16(value), count);
  return I128::NarrowFrom16To8(xl, xh);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u8, value, count, s8);
#endif
}

// per-lane logical right shift of the 8-bit lanes by a runtime count
NLIB_M(i128) I128::ShiftRightLogical8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i xh = _mm_srli_epi16(_mm_cvtepu8_epi16(hi), count);
  __m128i xl = _mm_srli_epi16(_mm_cvtepu8_epi16(value), count);
  return _mm_packus_epi16(xl, xh);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u8, value, -count, s8);
#endif
}

// per-lane arithmetic right shift of the 8-bit lanes by a runtime count
NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i hi = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i xh = _mm_srai_epi16(_mm_cvtepi8_epi16(hi), count);
  __m128i xl = _mm_srai_epi16(_mm_cvtepi8_epi16(value), count);
  // narrow by taking the low byte of each 16-bit result; a bare
  // _mm_packus_epi16 here would clamp negative lanes to zero
  return I128::NarrowFrom16To8(xl, xh);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s8, value, -count, s8);
#endif
}
NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u16, value, count, s16);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u16, value, -count, s16);
#endif
}

NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srai_epi16(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s16, value, -count, s16);
#endif
}

NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u32, value, count, s32);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u32, value, -count, s32);
#endif
}

NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srai_epi32(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, s32, value, -count, s32);
#endif
}

NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_epi64(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u64, value, count, s64);
#endif
}

NLIB_M(i128) I128::ShiftRightLogical64(i128arg value, int count) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_epi64(value, count);
#elif defined(NLIB_NEON)
  return NLIB_SFT(vshlq, u64, value, -count, s64);
#endif
}
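
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): runtime-count
// shifts. As with the underlying intrinsics, the count is assumed to be
// within the lane width (e.g. 0..15 for the 16-bit forms):
//
//   i128 hi = I128::ShiftRightLogical16(v, 8);   // high byte of each u16 lane
//   i128 sc = I128::ShiftLeftLogical32(v, n);    // n supplied at runtime
// ---------------------------------------------------------------------------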
// compile-time-count shifts: on NEON these lower to the immediate-form
// instructions; elsewhere they forward to the runtime-count versions
template<int N>
NLIB_M(i128) I128::ShiftLeftLogical8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return vshlq_n_s8(value, N);
#else
  return I128::ShiftLeftLogical8(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightLogical8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint8x16_t tmp = vreinterpretq_u8_s8(value);
  return vreinterpretq_s8_u8(vshrq_n_u8(tmp, N));
#else
  return I128::ShiftRightLogical8(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightArithmetic8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return vshrq_n_s8(value, N);
#else
  return I128::ShiftRightArithmetic8(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftLeftLogical16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint16x8_t tmp = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vshlq_n_u16(tmp, N));
#else
  return I128::ShiftLeftLogical16(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightLogical16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint16x8_t tmp = vreinterpretq_u16_s8(value);
  return vreinterpretq_s8_u16(vshrq_n_u16(tmp, N));
#else
  return I128::ShiftRightLogical16(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightArithmetic16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  int16x8_t tmp = vreinterpretq_s16_s8(value);
  return vreinterpretq_s8_s16(vshrq_n_s16(tmp, N));
#else
  return I128::ShiftRightArithmetic16(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftLeftLogical32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint32x4_t tmp = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vshlq_n_u32(tmp, N));
#else
  return I128::ShiftLeftLogical32(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightLogical32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint32x4_t tmp = vreinterpretq_u32_s8(value);
  return vreinterpretq_s8_u32(vshrq_n_u32(tmp, N));
#else
  return I128::ShiftRightLogical32(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightArithmetic32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  int32x4_t tmp = vreinterpretq_s32_s8(value);
  return vreinterpretq_s8_s32(vshrq_n_s32(tmp, N));
#else
  return I128::ShiftRightArithmetic32(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftLeftLogical64(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint64x2_t tmp = vreinterpretq_u64_s8(value);
  return vreinterpretq_s8_u64(vshlq_n_u64(tmp, N));
#else
  return I128::ShiftLeftLogical64(value, N);
#endif
}

template<int N>
NLIB_M(i128) I128::ShiftRightLogical64(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  uint64x2_t tmp = vreinterpretq_u64_s8(value);
  return vreinterpretq_s8_u64(vshrq_n_u64(tmp, N));
#else
  return I128::ShiftRightLogical64(value, N);
#endif
}
// specializations: a full-width left shift clears every lane, and a
// zero-count right shift is the identity
template<>
NLIB_M(i128) I128::ShiftLeftLogical8<8>(i128arg value) NLIB_NOEXCEPT {
  return I128::SetZero();
}

template<>
NLIB_M(i128) I128::ShiftRightLogical8<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}

template<>
NLIB_M(i128) I128::ShiftLeftLogical16<16>(i128arg value) NLIB_NOEXCEPT {
  return I128::SetZero();
}

template<>
NLIB_M(i128) I128::ShiftRightLogical16<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}

template<>
NLIB_M(i128) I128::ShiftRightArithmetic16<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}

template<>
NLIB_M(i128) I128::ShiftLeftLogical32<32>(i128arg value) NLIB_NOEXCEPT {
  return I128::SetZero();
}

template<>
NLIB_M(i128) I128::ShiftRightLogical32<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}

template<>
NLIB_M(i128) I128::ShiftRightArithmetic32<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}

template<>
NLIB_M(i128) I128::ShiftLeftLogical64<64>(i128arg value) NLIB_NOEXCEPT {
  return I128::SetZero();
}

template<>
NLIB_M(i128) I128::ShiftRightLogical64<0>(i128arg value) NLIB_NOEXCEPT {
  return value;
}
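
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): when the
// count is a compile-time constant the template forms are preferable, since
// on NEON they lower to the immediate shift instructions; the
// specializations above fold the degenerate counts:
//
//   i128 q0 = I128::ShiftRightArithmetic16<4>(v);   // per-lane v >> 4
//   i128 q1 = I128::ShiftRightLogical16<0>(v);      // identity, no code
// ---------------------------------------------------------------------------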
// shifts the whole 128-bit value left by N bytes, filling with zeros
template<int N>
NLIB_M(i128) I128::ByteShiftLeft(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_slli_si128(value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(vdupq_n_s8(0), value, 16 - N);
#endif
}

// shifts the whole 128-bit value right by N bytes, filling with zeros
template<int N>
NLIB_M(i128) I128::ByteShiftRight(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_srli_si128(value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(value, vdupq_n_s8(0), N);
#endif
}

// rotates the 16 bytes right by N positions
template<int N>
NLIB_M(i128) I128::ByteRotateRight(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_alignr_epi8(value, value, N);
#elif defined(NLIB_NEON)
  return vextq_s8(value, value, N);
#endif
}

// concatenates a:b and extracts the 16 bytes starting at byte N
// (the semantics of _mm_alignr_epi8)
template<int N>
NLIB_M(i128) I128::AlignR(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_alignr_epi8(a, b, N);
#elif defined(NLIB_NEON)
  return vextq_s8(b, a, N);
#endif
}
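
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): AlignR gives
// a sliding byte window over two consecutive loads, the usual building block
// for streaming over unaligned data. The offset 3 and the byte pointer p are
// arbitrary choices for the example:
//
//   i128 prev = I128::LoadA16(p);            // p: 16-byte-aligned uint8_t*
//   i128 next = I128::LoadA16(p + 16);
//   i128 win  = I128::AlignR<3>(next, prev); // bytes prev[3..15], next[0..2]
// ---------------------------------------------------------------------------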
// narrows each 16-bit lane to its low byte (modular truncation)
NLIB_M(i128) I128::NarrowFrom16To8(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0x00FFU, each_uint16);
  __m128i lo_mask = _mm_and_si128(lo, mask);
  __m128i hi_mask = _mm_and_si128(hi, mask);
  return _mm_packus_epi16(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
  return vreinterpretq_s8_u8(vmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
# else
  uint8x8_t l = vmovn_u16(vreinterpretq_u16_s8(lo));
  uint8x8_t h = vmovn_u16(vreinterpretq_u16_s8(hi));
  return NLIB_CMB(u8, l, h);
# endif
#endif
}

// narrows each 32-bit lane to its low 16 bits
NLIB_M(i128) I128::NarrowFrom32To16(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
  __m128i lo_mask = _mm_and_si128(lo, mask);
  __m128i hi_mask = _mm_and_si128(hi, mask);
  return _mm_packus_epi32(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
  return vreinterpretq_s8_u16(vmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
# else
  uint16x4_t l = vmovn_u32(vreinterpretq_u32_s8(lo));
  uint16x4_t h = vmovn_u32(vreinterpretq_u32_s8(hi));
  return NLIB_CMB(u16, l, h);
# endif
#endif
}

// narrows each 64-bit lane to its low 32 bits
NLIB_M(i128) I128::NarrowFrom64To32(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i lo_ = _mm_shuffle_epi32(lo, _MM_SHUFFLE(3, 1, 2, 0));
  __m128i hi_ = _mm_shuffle_epi32(hi, _MM_SHUFFLE(3, 1, 2, 0));
  return _mm_unpacklo_epi64(lo_, hi_);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
  return vreinterpretq_s8_u32(vmovn_high_u64(l, vreinterpretq_u64_s8(hi)));
# else
  uint32x2_t l = vmovn_u64(vreinterpretq_u64_s8(lo));
  uint32x2_t h = vmovn_u64(vreinterpretq_u64_s8(hi));
  return NLIB_CMB(u32, l, h);
# endif
#endif
}
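
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): the Narrow*
// family truncates, keeping only the low half of each lane:
//
//   i128 lo16 = I128::LoadA16(p);                   // eight uint16 lanes
//   i128 hi16 = I128::LoadA16(p + 16);              // eight more
//   i128 b8   = I128::NarrowFrom16To8(lo16, hi16);  // low byte of each lane
// ---------------------------------------------------------------------------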
// narrows uint16 lanes to uint8 with saturation
NLIB_M(i128) I128::ConvertFromUint16ToUint8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 b7FFF = I128::SetValue(0x7FFFU, each_uint16);
  __m128i lotmp = _mm_and_si128(lo, b7FFF);
  __m128i hitmp = _mm_and_si128(hi, b7FFF);
  return _mm_packus_epi16(lotmp, hitmp);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
  return vreinterpretq_s8_u8(vqmovn_high_u16(l, vreinterpretq_u16_s8(hi)));
# else
  uint8x8_t l = vqmovn_u16(vreinterpretq_u16_s8(lo));
  uint8x8_t h = vqmovn_u16(vreinterpretq_u16_s8(hi));
  return NLIB_CMB(u8, l, h);
# endif
#endif
}

// narrows int16 lanes to int8 with signed saturation
NLIB_M(i128) I128::ConvertFromInt16ToInt8Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_packs_epi16(lo, hi);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
  return vqmovn_high_s16(l, vreinterpretq_s16_s8(hi));
# else
  int8x8_t l = vqmovn_s16(vreinterpretq_s16_s8(lo));
  int8x8_t h = vqmovn_s16(vreinterpretq_s16_s8(hi));
  return NLIB_CMB(s8, l, h);
# endif
#endif
}

// narrows uint32 lanes to uint16 with saturation
NLIB_M(i128) I128::ConvertFromUint32ToUint16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 b7FFFFFFF = I128::SetValue(0x7FFFFFFFU, each_uint32);
  __m128i lotmp = _mm_and_si128(lo, b7FFFFFFF);
  __m128i hitmp = _mm_and_si128(hi, b7FFFFFFF);
  return _mm_packus_epi32(lotmp, hitmp);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
  return vreinterpretq_s8_u16(vqmovn_high_u32(l, vreinterpretq_u32_s8(hi)));
# else
  uint16x4_t l = vqmovn_u32(vreinterpretq_u32_s8(lo));
  uint16x4_t h = vqmovn_u32(vreinterpretq_u32_s8(hi));
  return NLIB_CMB(u16, l, h);
# endif
#endif
}

// narrows int32 lanes to int16 with signed saturation
NLIB_M(i128) I128::ConvertFromInt32ToInt16Saturated(i128arg lo, i128arg hi) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_packs_epi32(lo, hi);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
  return vreinterpretq_s8_s16(vqmovn_high_s32(l, vreinterpretq_s32_s8(hi)));
# else
  int16x4_t l = vqmovn_s32(vreinterpretq_s32_s8(lo));
  int16x4_t h = vqmovn_s32(vreinterpretq_s32_s8(hi));
  return NLIB_CMB(s16, l, h);
# endif
#endif
}
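
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): unlike the
// modular Narrow* family, the saturated forms clamp out-of-range lanes --
// the usual choice when packing pixel or audio data:
//
//   i128 px = I128::ConvertFromUint16ToUint8Saturated(lo16, hi16);
// ---------------------------------------------------------------------------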
// sign-extends the low eight int8 lanes to int16
NLIB_M(i128) I128::ConvertFromInt8ToInt16Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi8_epi16(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s16(vmovl_s8(vget_low_s8(value)));
#endif
}

// sign-extends the high eight int8 lanes to int16
NLIB_M(i128) I128::ConvertFromInt8ToInt16Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  int16x8_t result = vmovl_high_s8(value);
# else
  int16x8_t result = vmovl_s8(vget_high_s8(value));
# endif
  return vreinterpretq_s8_s16(result);
#endif
}

// sign-extends the low four int16 lanes to int32
NLIB_M(i128) I128::ConvertFromInt16ToInt32Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi16_epi32(value);
#elif defined(NLIB_NEON)
  int16x8_t x = vreinterpretq_s16_s8(value);
  int32x4_t result = vmovl_s16(vget_low_s16(x));
  return vreinterpretq_s8_s32(result);
#endif
}

// sign-extends the high four int16 lanes to int32
NLIB_M(i128) I128::ConvertFromInt16ToInt32Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int16x8_t x = vreinterpretq_s16_s8(value);
# ifdef __aarch64__
  int32x4_t result = vmovl_high_s16(x);
# else
  int32x4_t result = vmovl_s16(vget_high_s16(x));
# endif
  return vreinterpretq_s8_s32(result);
#endif
}

// sign-extends the low two int32 lanes to int64
NLIB_M(i128) I128::ConvertFromInt32ToInt64Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_epi64(value);
#elif defined(NLIB_NEON)
  int32x4_t x = vreinterpretq_s32_s8(value);
  int64x2_t result = vmovl_s32(vget_low_s32(x));
  return vreinterpretq_s8_s64(result);
#endif
}

// sign-extends the high two int32 lanes to int64
NLIB_M(i128) I128::ConvertFromInt32ToInt64Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  int32x4_t x = vreinterpretq_s32_s8(value);
# ifdef __aarch64__
  int64x2_t result = vmovl_high_s32(x);
# else
  int64x2_t result = vmovl_s32(vget_high_s32(x));
# endif
  return vreinterpretq_s8_s64(result);
#endif
}

// zero-extends the low eight uint8 lanes to uint16
NLIB_M(i128) I128::ConvertFromUint8ToUint16Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu8_epi16(value);
#elif defined(NLIB_NEON)
  uint8x16_t x = vreinterpretq_u8_s8(value);
  uint16x8_t result = vmovl_u8(vget_low_u8(x));
  return vreinterpretq_s8_u16(result);
#endif
}

// zero-extends the high eight uint8 lanes to uint16
NLIB_M(i128) I128::ConvertFromUint8ToUint16Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu8_epi16(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint8x16_t x = vreinterpretq_u8_s8(value);
# ifdef __aarch64__
  uint16x8_t result = vmovl_high_u8(x);
# else
  uint16x8_t result = vmovl_u8(vget_high_u8(x));
# endif
  return vreinterpretq_s8_u16(result);
#endif
}

// zero-extends the low four uint16 lanes to uint32
NLIB_M(i128) I128::ConvertFromUint16ToUint32Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu16_epi32(value);
#elif defined(NLIB_NEON)
  uint16x8_t x = vreinterpretq_u16_s8(value);
  uint32x4_t result = vmovl_u16(vget_low_u16(x));
  return vreinterpretq_s8_u32(result);
#endif
}

// zero-extends the high four uint16 lanes to uint32
NLIB_M(i128) I128::ConvertFromUint16ToUint32Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu16_epi32(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint16x8_t x = vreinterpretq_u16_s8(value);
# ifdef __aarch64__
  uint32x4_t result = vmovl_high_u16(x);
# else
  uint32x4_t result = vmovl_u16(vget_high_u16(x));
# endif
  return vreinterpretq_s8_u32(result);
#endif
}

// zero-extends the low two uint32 lanes to uint64
NLIB_M(i128) I128::ConvertFromUint32ToUint64Lo(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu32_epi64(value);
#elif defined(NLIB_NEON)
  uint32x4_t x = vreinterpretq_u32_s8(value);
  uint64x2_t result = vmovl_u32(vget_low_u32(x));
  return vreinterpretq_s8_u64(result);
#endif
}

// zero-extends the high two uint32 lanes to uint64
NLIB_M(i128) I128::ConvertFromUint32ToUint64Hi(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepu32_epi64(_mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
#elif defined(NLIB_NEON)
  uint32x4_t x = vreinterpretq_u32_s8(value);
# ifdef __aarch64__
  uint64x2_t result = vmovl_high_u32(x);
# else
  uint64x2_t result = vmovl_u32(vget_high_u32(x));
# endif
  return vreinterpretq_s8_u64(result);
#endif
}
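
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): widening a
// full vector always takes the Lo/Hi pair, one call per half:
//
//   i128 w0 = I128::ConvertFromUint8ToUint16Lo(v);  // lanes 0..7  as uint16
//   i128 w1 = I128::ConvertFromUint8ToUint16Hi(v);  // lanes 8..15 as uint16
// ---------------------------------------------------------------------------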
// interleaves the low eight 8-bit lanes of a and b
NLIB_M(i128) I128::Zip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi8(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vzip1q_s8(a, b);
# else
  return vzipq_s8(a, b).val[0];
# endif
#endif
}

// interleaves the high eight 8-bit lanes of a and b
NLIB_M(i128) I128::Zip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpackhi_epi8(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vzip2q_s8(a, b);
# else
  return vzipq_s8(a, b).val[1];
# endif
#endif
}

// gathers the even-indexed 8-bit lanes of a, then of b
NLIB_M(i128) I128::Unzip8Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0x00FFU, each_uint16);
  __m128i lo_mask = _mm_and_si128(a, mask);
  __m128i hi_mask = _mm_and_si128(b, mask);
  return _mm_packus_epi16(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vuzp1q_s8(a, b);
# else
  return vuzpq_s8(a, b).val[0];
# endif
#endif
}

// gathers the odd-indexed 8-bit lanes of a, then of b
NLIB_M(i128) I128::Unzip8Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFF00U, each_uint16);
  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 1);
  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 1);
  return _mm_packus_epi16(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vuzp2q_s8(a, b);
# else
  return vuzpq_s8(a, b).val[1];
# endif
#endif
}
// interleaves the low four 16-bit lanes of a and b
NLIB_M(i128) I128::Zip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi16(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return NLIB_OP2(vzip1q, u16, a, b);
# else
  return vreinterpretq_s8_u16(vzipq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
# endif
#endif
}

// interleaves the high four 16-bit lanes of a and b
NLIB_M(i128) I128::Zip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpackhi_epi16(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return NLIB_OP2(vzip2q, u16, a, b);
# else
  return vreinterpretq_s8_u16(vzipq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
# endif
#endif
}

// gathers the even-indexed 16-bit lanes of a, then of b
NLIB_M(i128) I128::Unzip16Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFFFFU, each_uint32);
  __m128i lo_mask = _mm_and_si128(a, mask);
  __m128i hi_mask = _mm_and_si128(b, mask);
  return _mm_packus_epi32(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return NLIB_OP2(vuzp1q, u16, a, b);
# else
  return vreinterpretq_s8_u16(vuzpq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[0]);
# endif
#endif
}

// gathers the odd-indexed 16-bit lanes of a, then of b
NLIB_M(i128) I128::Unzip16Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  i128 mask = I128::SetValue(0xFFFF0000U, each_uint32);
  __m128i lo_mask = _mm_srli_si128(_mm_and_si128(a, mask), 2);
  __m128i hi_mask = _mm_srli_si128(_mm_and_si128(b, mask), 2);
  return _mm_packus_epi32(lo_mask, hi_mask);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return NLIB_OP2(vuzp2q, u16, a, b);
# else
  return vreinterpretq_s8_u16(vuzpq_u16(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b)).val[1]);
# endif
#endif
}
// interleaves the low two 32-bit lanes of a and b
NLIB_M(i128) I128::Zip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpacklo_epi32(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return NLIB_OP2(vzip1q, u32, a, b);
# else
  return vreinterpretq_s8_u32(vzipq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
# endif
#endif
}

// interleaves the high two 32-bit lanes of a and b
NLIB_M(i128) I128::Zip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_unpackhi_epi32(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return NLIB_OP2(vzip2q, u32, a, b);
# else
  return vreinterpretq_s8_u32(vzipq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
# endif
#endif
}

// gathers the even-indexed 32-bit lanes of a, then of b
NLIB_M(i128) I128::Unzip32Lo(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
  return _mm_blend_epi16(x0, x1, 0xF0);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return NLIB_OP2(vuzp1q, u32, a, b);
# else
  return vreinterpretq_s8_u32(vuzpq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0]);
# endif
#endif
}

// gathers the odd-indexed 32-bit lanes of a, then of b
NLIB_M(i128) I128::Unzip32Hi(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i x0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 3, 1));
  __m128i x1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0));
  return _mm_blend_epi16(x0, x1, 0xF0);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return NLIB_OP2(vuzp2q, u32, a, b);
# else
  return vreinterpretq_s8_u32(vuzpq_u32(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[1]);
# endif
#endif
}
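
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): Unzip*
// deinterleaves and Zip* re-interleaves, e.g. splitting interleaved 16-bit
// stereo samples into channels:
//
//   i128 s0 = I128::LoadA16(p);             // L0 R0 L1 R1 ...
//   i128 s1 = I128::LoadA16(p + 16);
//   i128 left  = I128::Unzip16Lo(s0, s1);   // L0..L7 (even lanes)
//   i128 right = I128::Unzip16Hi(s0, s1);   // R0..R7 (odd lanes)
// ---------------------------------------------------------------------------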
// Selects 16 bytes from the pair (a, b): indices 0..15 pick lanes of a and
// 16..31 pick lanes of b; on the Shuffle8 fallback path any other index
// yields a zero lane.
template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7,
         int V8, int V9, int V10, int V11, int V12, int V13, int V14, int V15>
NLIB_M(i128) I128::Permute8(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return __builtin_shufflevector(
      a, b,
      V0, V1, V2, V3, V4, V5, V6, V7,
      V8, V9, V10, V11, V12, V13, V14, V15);
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v16qi)a, (__v16qi)b,
      V0, V1, V2, V3, V4, V5, V6, V7,
      V8, V9, V10, V11, V12, V13, V14, V15);
#else
  NLIB_ALIGNAS(16) static const int8_t mask_a[16] = {
    (V0 < 0 || V0 > 15) ? -128 : V0,
    (V1 < 0 || V1 > 15) ? -128 : V1,
    (V2 < 0 || V2 > 15) ? -128 : V2,
    (V3 < 0 || V3 > 15) ? -128 : V3,
    (V4 < 0 || V4 > 15) ? -128 : V4,
    (V5 < 0 || V5 > 15) ? -128 : V5,
    (V6 < 0 || V6 > 15) ? -128 : V6,
    (V7 < 0 || V7 > 15) ? -128 : V7,
    (V8 < 0 || V8 > 15) ? -128 : V8,
    (V9 < 0 || V9 > 15) ? -128 : V9,
    (V10 < 0 || V10 > 15) ? -128 : V10,
    (V11 < 0 || V11 > 15) ? -128 : V11,
    (V12 < 0 || V12 > 15) ? -128 : V12,
    (V13 < 0 || V13 > 15) ? -128 : V13,
    (V14 < 0 || V14 > 15) ? -128 : V14,
    (V15 < 0 || V15 > 15) ? -128 : V15
  };
  NLIB_ALIGNAS(16) static const int8_t mask_b[16] = {
    V0 < 16 ? -128 : (V0 - 16),
    V1 < 16 ? -128 : (V1 - 16),
    V2 < 16 ? -128 : (V2 - 16),
    V3 < 16 ? -128 : (V3 - 16),
    V4 < 16 ? -128 : (V4 - 16),
    V5 < 16 ? -128 : (V5 - 16),
    V6 < 16 ? -128 : (V6 - 16),
    V7 < 16 ? -128 : (V7 - 16),
    V8 < 16 ? -128 : (V8 - 16),
    V9 < 16 ? -128 : (V9 - 16),
    V10 < 16 ? -128 : (V10 - 16),
    V11 < 16 ? -128 : (V11 - 16),
    V12 < 16 ? -128 : (V12 - 16),
    V13 < 16 ? -128 : (V13 - 16),
    V14 < 16 ? -128 : (V14 - 16),
    V15 < 16 ? -128 : (V15 - 16)
  };
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
#endif
}
// Selects eight 16-bit lanes from the pair (a, b): indices 0..7 pick from a,
// 8..15 from b.
template<int V0, int V1, int V2, int V3, int V4, int V5, int V6, int V7>
NLIB_M(i128) I128::Permute16(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return vreinterpretq_s8_u16(__builtin_shufflevector(
      vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b),
      V0, V1, V2, V3, V4, V5, V6, V7));
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v8hi)a, (__v8hi)b,
      V0, V1, V2, V3, V4, V5, V6, V7);
#else
  NLIB_ALIGNAS(16) static const int8_t mask_a[16] = {
    (V0 < 0 || V0 > 7) ? -128 : V0 * 2,
    (V0 < 0 || V0 > 7) ? -128 : V0 * 2 + 1,
    (V1 < 0 || V1 > 7) ? -128 : V1 * 2,
    (V1 < 0 || V1 > 7) ? -128 : V1 * 2 + 1,
    (V2 < 0 || V2 > 7) ? -128 : V2 * 2,
    (V2 < 0 || V2 > 7) ? -128 : V2 * 2 + 1,
    (V3 < 0 || V3 > 7) ? -128 : V3 * 2,
    (V3 < 0 || V3 > 7) ? -128 : V3 * 2 + 1,
    (V4 < 0 || V4 > 7) ? -128 : V4 * 2,
    (V4 < 0 || V4 > 7) ? -128 : V4 * 2 + 1,
    (V5 < 0 || V5 > 7) ? -128 : V5 * 2,
    (V5 < 0 || V5 > 7) ? -128 : V5 * 2 + 1,
    (V6 < 0 || V6 > 7) ? -128 : V6 * 2,
    (V6 < 0 || V6 > 7) ? -128 : V6 * 2 + 1,
    (V7 < 0 || V7 > 7) ? -128 : V7 * 2,
    (V7 < 0 || V7 > 7) ? -128 : V7 * 2 + 1
  };
  NLIB_ALIGNAS(16) static const int8_t mask_b[16] = {
    V0 < 8 ? -128 : (V0 - 8) * 2,
    V0 < 8 ? -128 : (V0 - 8) * 2 + 1,
    V1 < 8 ? -128 : (V1 - 8) * 2,
    V1 < 8 ? -128 : (V1 - 8) * 2 + 1,
    V2 < 8 ? -128 : (V2 - 8) * 2,
    V2 < 8 ? -128 : (V2 - 8) * 2 + 1,
    V3 < 8 ? -128 : (V3 - 8) * 2,
    V3 < 8 ? -128 : (V3 - 8) * 2 + 1,
    V4 < 8 ? -128 : (V4 - 8) * 2,
    V4 < 8 ? -128 : (V4 - 8) * 2 + 1,
    V5 < 8 ? -128 : (V5 - 8) * 2,
    V5 < 8 ? -128 : (V5 - 8) * 2 + 1,
    V6 < 8 ? -128 : (V6 - 8) * 2,
    V6 < 8 ? -128 : (V6 - 8) * 2 + 1,
    V7 < 8 ? -128 : (V7 - 8) * 2,
    V7 < 8 ? -128 : (V7 - 8) * 2 + 1
  };
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
#endif
}
// Selects four 32-bit lanes from the pair (a, b): indices 0..3 pick from a,
// 4..7 from b.
template<int V0, int V1, int V2, int V3>
NLIB_M(i128) I128::Permute32(i128arg a, i128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && defined(NLIB_NEON)
  return vreinterpretq_s8_u32(__builtin_shufflevector(
      vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b),
      V0, V1, V2, V3));
#elif __has_builtin(__builtin_shufflevector) && defined(NLIB_SSE41)
  return __builtin_shufflevector((__v4si)a, (__v4si)b,
      V0, V1, V2, V3);
#else
  NLIB_ALIGNAS(16) static const int8_t mask_a[16] = {
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4,
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 1,
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 2,
    (V0 < 0 || V0 > 3) ? -128 : V0 * 4 + 3,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 1,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 2,
    (V1 < 0 || V1 > 3) ? -128 : V1 * 4 + 3,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 1,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 2,
    (V2 < 0 || V2 > 3) ? -128 : V2 * 4 + 3,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 1,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 2,
    (V3 < 0 || V3 > 3) ? -128 : V3 * 4 + 3
  };
  NLIB_ALIGNAS(16) static const int8_t mask_b[16] = {
    V0 < 4 ? -128 : (V0 - 4) * 4,
    V0 < 4 ? -128 : (V0 - 4) * 4 + 1,
    V0 < 4 ? -128 : (V0 - 4) * 4 + 2,
    V0 < 4 ? -128 : (V0 - 4) * 4 + 3,
    V1 < 4 ? -128 : (V1 - 4) * 4,
    V1 < 4 ? -128 : (V1 - 4) * 4 + 1,
    V1 < 4 ? -128 : (V1 - 4) * 4 + 2,
    V1 < 4 ? -128 : (V1 - 4) * 4 + 3,
    V2 < 4 ? -128 : (V2 - 4) * 4,
    V2 < 4 ? -128 : (V2 - 4) * 4 + 1,
    V2 < 4 ? -128 : (V2 - 4) * 4 + 2,
    V2 < 4 ? -128 : (V2 - 4) * 4 + 3,
    V3 < 4 ? -128 : (V3 - 4) * 4,
    V3 < 4 ? -128 : (V3 - 4) * 4 + 1,
    V3 < 4 ? -128 : (V3 - 4) * 4 + 2,
    V3 < 4 ? -128 : (V3 - 4) * 4 + 3
  };
  i128 tmp_a = I128::Shuffle8(a, I128::LoadA16(mask_a));
  i128 tmp_b = I128::Shuffle8(b, I128::LoadA16(mask_b));
  return I128::Or(tmp_a, tmp_b);
#endif
}
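
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): Permute*
// draws lanes from the concatenated pair -- indices below the lane count
// select from a, the rest from b (and the Shuffle8 fallback zeroes any index
// outside both ranges):
//
//   i128 rev = I128::Permute8<15, 14, 13, 12, 11, 10, 9, 8,
//                             7, 6, 5, 4, 3, 2, 1, 0>(a, b);  // a reversed
//   i128 mix = I128::Permute32<0, 4, 1, 5>(a, b);             // a0 b0 a1 b1
// ---------------------------------------------------------------------------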
// Byte-order reversal within each lane. (The signatures of the next three
// functions were lost in extraction; the names Reverse16/32/64 are assumed
// from the vrev16q/vrev32q/vrev64q intrinsics they wrap.)
NLIB_M(i128) I128::Reverse16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const uint8_t mask_[16] = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev16q, u8, value);
#endif
}

NLIB_M(i128) I128::Reverse32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const uint8_t mask_[16] = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev32q, u8, value);
#endif
}

NLIB_M(i128) I128::Reverse64(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  NLIB_ALIGNAS(16) static const uint8_t mask_[16] = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
  };
  return _mm_shuffle_epi8(value, *reinterpret_cast<const __m128i*>(&mask_[0]));
#elif defined(NLIB_NEON)
  return NLIB_OP1(vrev64q, u8, value);
#endif
}
// Packs the most significant bit of each 8-bit lane into bits 0..15 of the
// result. (The signature was lost in extraction; the name MoveMask8 is
// recovered from references later in this header, and the int return type is
// reconstructed.)
NLIB_M(int) I128::MoveMask8(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_movemask_epi8(value);
#elif defined(NLIB_NEON)
  uint8x16_t powers = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL));
  uint8x16_t a = vandq_u8(value, powers);
# ifdef __aarch64__
  return vaddv_u8(vget_low_u8(a)) | (vaddv_u8(vget_high_u8(a)) << 8);
# else
  uint8x8_t al = vget_low_u8(a);
  uint8x8_t ah = vget_high_u8(a);
  uint8x8_t tmp = vpadd_u8(al, ah);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u16(vreinterpret_u16_u8(tmp), 0);
# endif
#endif
}

// 16-bit-lane variant: one bit per lane into bits 0..7 (name assumed by
// analogy with MoveMask8; signature lost in extraction)
NLIB_M(int) I128::MoveMask16(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_packs_epi16(value, value);
  return _mm_movemask_epi8(tmp) & 255;
#elif defined(NLIB_NEON)
  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
  uint16x8_t a = vandq_u16(vreinterpretq_u16_s8(value), powers);
# ifdef __aarch64__
  return vaddvq_u16(a);
# else
  uint8x8_t tmp = vmovn_u16(a);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  tmp = vpadd_u8(tmp, tmp);
  return vget_lane_u8(tmp, 0);
# endif
#endif
}

// 32-bit-lane variant: one bit per lane into bits 0..3 (name assumed by
// analogy; signature lost in extraction)
NLIB_M(int) I128::MoveMask32(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  __m128i tmp = _mm_packs_epi16(value, value);
  tmp = _mm_packs_epi16(tmp, tmp);
  return _mm_movemask_epi8(tmp) & 15;
#elif defined(NLIB_NEON)
  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
  uint32x4_t a = vandq_u32(vreinterpretq_u32_s8(value), powers);
# ifdef __aarch64__
  return vaddvq_u32(a);
# else
  uint16x4_t tmp = vmovn_u32(a);
  tmp = vpadd_u16(tmp, tmp);
  tmp = vpadd_u16(tmp, tmp);
  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
# endif
#endif
}
// The inverses of the MoveMask helpers: each of the next three bodies expands
// an integer bitmask `mask` back into an i128 whose lanes are all-ones where
// the corresponding bit is set. (Their signatures were lost in extraction
// and are not reconstructed here.)

// -- 8-bit lanes --
#if defined(NLIB_NEON)
  int8x8_t m = vcreate_s8(0x8040201008040201ULL);
  int8x8_t s0 = vdup_n_s8(mask & 0xFF);
  int8x8_t s1 = vdup_n_s8(mask >> 8);
  return vtstq_s8(vcombine_s8(m, m), vcombine_s8(s0, s1));
#elif defined(NLIB_SSE41)
  i128 m = I128::SetValue(0x8040201008040201ULL, each_uint64);
  i128 s0 = I128::SetValue(mask & 0xFF, each_int8);
  i128 s1 = I128::SetValue(static_cast<int8_t>(mask >> 8), each_int8);
  i128 s = _mm_blend_epi16(s0, s1, 0xF0);
  return I128::Test8(m, s);
#endif

// -- 16-bit lanes --
#if defined(NLIB_NEON)
  uint16x4_t powers_lo = vcreate_u16(0x0008000400020001ULL);
  uint16x4_t powers_hi = vshl_n_u16(powers_lo, 4);
  uint16x8_t powers = vcombine_u16(powers_lo, powers_hi);
  uint16x8_t s = vdupq_n_u16(mask);
  return vreinterpretq_s8_u16(vtstq_u16(powers, s));
#elif defined(NLIB_SSE41)
  i128 m0 = I128::SetValue(0x0008000400020001ULL, each_uint64);
  i128 m1 = I128::SetValue(0x0080004000200010ULL, each_uint64);
  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
  i128 s = I128::SetValue(static_cast<int16_t>(mask), each_int16);
  return I128::Test16(m, s);
#endif

// -- 32-bit lanes --
#if defined(NLIB_NEON)
  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
  uint32x4_t s = vdupq_n_u32(mask);
  return vreinterpretq_s8_u32(vtstq_u32(powers, s));
#elif defined(NLIB_SSE41)
  i128 m0 = I128::SetValue(0x0000000200000001ULL, each_uint64);
  i128 m1 = I128::SetValue(0x0000000800000004ULL, each_uint64);
  i128 m = _mm_blend_epi16(m0, m1, 0xF0);
  i128 s = I128::SetValue(mask, each_int32);
  return I128::Test32(m, s);
#endif
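
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): the classic
// strlen-style scan, combining a zero compare with MoveMask8 and a bit scan.
// nlib_ctz is the same helper the lane-index code further below relies on:
//
//   int m = I128::MoveMask8(I128::CmpEqZero8(I128::LoadA16(p)));
//   if (m != 0) {
//     int idx = nlib_ctz(static_cast<uint32_t>(m));  // first zero byte
//   }
// ---------------------------------------------------------------------------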
// Returns true when all 128 bits are zero. (Signature lost in extraction.)
#if defined(NLIB_SSE41)
  return _mm_testz_si128(value, value) != 0;
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(value));
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
# else
  int8x8_t tmp = vorr_s8(vget_low_s8(value), vget_high_s8(value));
  return vget_lane_u64(vreinterpret_u64_s8(tmp), 0) == 0;
# endif
#endif

// Returns true when all 128 bits are one. (Signature lost in extraction.)
#if defined(NLIB_SSE41)
  return _mm_testc_si128(value, _mm_cmpeq_epi8(value, value)) != 0;
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_s8(vmvnq_s8(value)));
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
# else
  int8x8_t tmp = vand_s8(vget_low_s8(value), vget_high_s8(value));
  return vget_lane_s64(vreinterpret_s64_s8(tmp), 0) == -1;
# endif
#endif
// lane select: picks a where the mask is set, b elsewhere
// (on SSE, _mm_blendv_epi8 keys off the high bit of each mask byte)
NLIB_M(i128) I128::Select(i128arg mask, i128arg a, i128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_blendv_epi8(b, a, mask);
#elif defined(NLIB_NEON)
  return NLIB_OP3(vbslq, u32, mask, a, b);
#endif
}
// data-driven byte permute: result lane i is value[shuffle[i]]
NLIB_M(i128) I128::Shuffle8(i128arg value, i128arg shuffle) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_shuffle_epi8(value, shuffle);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vqtbl1q_s8(value, vreinterpretq_u8_s8(shuffle));
# else
  int8x8x2_t x;
  x.val[0] = vget_low_s8(value);
  x.val[1] = vget_high_s8(value);
  int8x8_t lo = vtbl2_s8(x, vget_low_s8(shuffle));
  int8x8_t hi = vtbl2_s8(x, vget_high_s8(shuffle));
  return vcombine_s8(lo, hi);
# endif
#endif
}
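
// ---------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header). Index bytes
// with the high bit set zero the lane on SSE, and any index above 15 zeroes
// it on NEON, so portable shuffle tables should stick to 0..15 or values
// >= 0x80:
//
//   NLIB_ALIGNAS(16) static const int8_t idx[16] = {
//     0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7   // duplicate bytes
//   };
//   i128 out = I128::Shuffle8(value, I128::LoadA16(idx));
// ---------------------------------------------------------------------------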
// Counts the lanes of an 8-bit mask vector that are all-ones: each true lane
// holds -1, so negating and summing yields the count. (Signature lost in
// extraction.)
#if defined(NLIB_NEON)
# ifdef __aarch64__
  int8x16_t tmp = vnegq_s8(value);
  return vaddvq_s8(tmp);
# else
  int8x16_t tmp = vnegq_s8(value);
  int8x8_t lo = vget_low_s8(tmp);
  int8x8_t hi = vget_high_s8(tmp);
  lo = vadd_s8(lo, hi);
  lo = vpadd_s8(lo, lo);
  lo = vpadd_s8(lo, lo);
  lo = vpadd_s8(lo, lo);
  return vget_lane_s8(lo, 0);
# endif
#else
  return nlib_popcnt16(static_cast<uint16_t>(I128::MoveMask8(value)));
#endif

// Number of leading false lanes, counted from lane 15 downward. (Signature
// lost in extraction.)
  return nlib_clz(static_cast<uint32_t>(I128::MoveMask8(value))) - 16;

// Number of trailing false lanes, clamped to 16 by the sentinel bit.
// (Signature lost in extraction.)
  return nlib_ctz(static_cast<uint32_t>(I128::MoveMask8(value) | 0x10000));
#undef vreinterpretq_s8_s8
#endif  // NLIB_DOXYGEN

// 4x4 transpose of the 32-bit lanes across four i128 rows
#if defined(NLIB_SSE41)
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    row0 = _mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 1, 2, 0)); \
    row1 = _mm_shuffle_epi32(row1, _MM_SHUFFLE(3, 1, 2, 0)); \
    row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(3, 1, 2, 0)); \
    row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(3, 1, 2, 0)); \
    __m128i t0_transpose32_ = _mm_unpacklo_epi32(row0, row1); \
    __m128i t1_transpose32_ = _mm_unpackhi_epi32(row0, row1); \
    __m128i t2_transpose32_ = _mm_unpacklo_epi32(row2, row3); \
    __m128i t3_transpose32_ = _mm_unpackhi_epi32(row2, row3); \
    row0 = _mm_unpacklo_epi64(t0_transpose32_, t2_transpose32_); \
    row1 = _mm_unpacklo_epi64(t1_transpose32_, t3_transpose32_); \
    row2 = _mm_unpackhi_epi64(t0_transpose32_, t2_transpose32_); \
    row3 = _mm_unpackhi_epi64(t1_transpose32_, t3_transpose32_); \
  }
#elif defined(NLIB_NEON)
#ifdef __aarch64__
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
                                     vreinterpretq_u32_s8(row1)); \
    uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
                                     vreinterpretq_u32_s8(row3)); \
    uint64x2_t row0_, row1_, row2_, row3_; \
    row0_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
                       vreinterpretq_u64_u32(trn_f1_.val[0])); \
    row0 = vreinterpretq_s8_u64(row0_); \
    row1_ = vtrn1q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
                       vreinterpretq_u64_u32(trn_f1_.val[1])); \
    row1 = vreinterpretq_s8_u64(row1_); \
    row2_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[0]), \
                       vreinterpretq_u64_u32(trn_f1_.val[0])); \
    row2 = vreinterpretq_s8_u64(row2_); \
    row3_ = vtrn2q_u64(vreinterpretq_u64_u32(trn_f0_.val[1]), \
                       vreinterpretq_u64_u32(trn_f1_.val[1])); \
    row3 = vreinterpretq_s8_u64(row3_); \
  }
#else
#define NLIB_I128_TRANSPOSE32(row0, row1, row2, row3) \
  { \
    uint32x4x2_t trn_f0_ = vtrnq_u32(vreinterpretq_u32_s8(row0), \
                                     vreinterpretq_u32_s8(row1)); \
    uint32x4x2_t trn_f1_ = vtrnq_u32(vreinterpretq_u32_s8(row2), \
                                     vreinterpretq_u32_s8(row3)); \
    uint32x4_t row0_, row1_, row2_, row3_; \
    uint32x2_t lo, hi; \
    lo = vget_low_u32(trn_f0_.val[0]); hi = vget_low_u32(trn_f1_.val[0]); \
    row0_ = vcombine_u32(lo, hi); \
    row0 = vreinterpretq_s8_u32(row0_); \
    lo = vget_low_u32(trn_f0_.val[1]); hi = vget_low_u32(trn_f1_.val[1]); \
    row1_ = vcombine_u32(lo, hi); \
    row1 = vreinterpretq_s8_u32(row1_); \
    lo = vget_high_u32(trn_f0_.val[0]); hi = vget_high_u32(trn_f1_.val[0]); \
    row2_ = vcombine_u32(lo, hi); \
    row2 = vreinterpretq_s8_u32(row2_); \
    lo = vget_high_u32(trn_f0_.val[1]); hi = vget_high_u32(trn_f1_.val[1]); \
    row3_ = vcombine_u32(lo, hi); \
    row3 = vreinterpretq_s8_u32(row3_); \
  }
#endif
#endif

#endif  // INCLUDE_NN_NLIB_SIMD_SIMDINT_H_

An empty structure used as a tag to indicate signed 32-bit integers.
An empty structure used as a tag to indicate signed 64-bit integers.
constexpr const each_uint8_tag each_uint8
A constant object of type each_uint8_tag; a tag indicating unsigned 8-bit integers.
An empty structure used as a tag for selecting lanes split into 8-bit units.
An empty structure used as a tag for selecting lanes split into 32-bit units.
An empty structure used as a tag to indicate signed 8-bit integers.
constexpr const each_uint16_tag each_uint16
A constant object of type each_uint16_tag; a tag indicating unsigned 16-bit integers.
An empty structure used as a tag to indicate unsigned 16-bit integers.
An empty structure used as a tag to indicate unsigned 64-bit integers.
constexpr const each_int64_tag each_int64
A constant object of type each_int64_tag; a tag indicating signed 64-bit integers.
nlib_i128_t i128
A typedef of nlib_i128_t.
constexpr const each_uint64_tag each_uint64
A constant object of type each_uint64_tag; a tag indicating unsigned 64-bit integers.
The class for integer SIMD operations using the 128-bit registers (XMM0-XMM15 on SSE, Q0-Q15 on NEON). ...
An empty structure used as a tag to indicate unsigned 8-bit integers.
An empty structure used as a tag to indicate signed 16-bit integers.
constexpr const each_int16_tag each_int16
A constant object of type each_int16_tag; a tag indicating signed 16-bit integers.
constexpr const each_uint32_tag each_uint32
A constant object of type each_uint32_tag; a tag indicating unsigned 32-bit integers.
#define NLIB_NOEXCEPT
Defined as noexcept, or an equivalent, to match the environment.
An empty structure used as a tag for selecting lanes split into 16-bit units.
constexpr const each_select16_tag each_select16
A constant object of type each_select16_tag; a tag indicating selection of 16-bit lanes. ...
#define NLIB_CEXPR
Defined as constexpr when it is available; otherwise empty.
__m128i nlib_i128_t
The type used for 128-bit integer SIMD registers.
constexpr const each_select8_tag each_select8
A constant object of type each_select8_tag; a tag indicating selection of 8-bit lanes. ...
#define NLIB_ALIGNAS(x)
Defined as alignas(x) or an equivalent.
constexpr const each_int8_tag each_int8
A constant object of type each_int8_tag; a tag indicating signed 8-bit integers.
constexpr const each_select32_tag each_select32
A constant object of type each_select32_tag; a tag indicating selection of 32-bit lanes. ...
An empty structure used as a tag to indicate unsigned 32-bit integers.
constexpr const each_int32_tag each_int32
A constant object of type each_int32_tag; a tag indicating signed 32-bit integers.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion; uses static_assert when it is available.