19#ifndef __LAMMP_LONGLONG_H__
20#define __LAMMP_LONGLONG_H__
25#elif defined(USE_ASM) && (defined(__x86_64__)) && (defined(__GNUC__) || defined(__clang__))
31static inline void _umul64to128_(uint64_t a, uint64_t b, uint64_t *low, uint64_t *high) {
32#if (defined(__GNUC__) || defined(__clang__))
33#if defined(USE_ASM) && (defined(__x86_64__))
41 __uint128_t prod = (__uint128_t)a * b;
42 *low = (uint64_t)prod;
43 *high = (uint64_t)(prod >> 64);
45#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
46 *low = _umul128(a, b, high);
48 uint64_t ah = a >> 32, bh = b >> 32;
49 a = (uint32_t)a, b = (uint32_t)b;
50 uint64_t
r0 = a * b,
r1 = a * bh,
r2 = ah * b,
r3 = ah * bh;
51 r3 += (
r1 >> 32) + (
r2 >> 32);
52 r1 = (uint32_t)
r1,
r2 = (uint32_t)
r2;
55 *high =
r3 + (
r1 >> 32);
56 *low = (
r1 << 32) | (uint32_t)
r0;
61#if (defined(__GNUC__) || defined(__clang__)) && defined(__SIZEOF_INT128__)
62 __uint128_t t = (__uint128_t)a * (__uint128_t)b;
63 return (uint64_t)(t >> 64);
64#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
67 uint64_t ah = a >> 32, bh = b >> 32;
68 a = (uint32_t)a, b = (uint32_t)b;
69 uint64_t
r0 = a * b,
r1 = a * bh,
r2 = ah * b,
r3 = ah * bh;
70 r3 += (
r1 >> 32) + (
r2 >> 32);
71 r1 = (uint32_t)
r1,
r2 = (uint32_t)
r2;
74 return r3 + (
r1 >> 32);
78static inline void _umulx64to128_(uint64_t a, uint64_t b, uint64_t* low, uint64_t* high) {
79#if defined(USE_ASM) && (defined(__x86_64__))
80 *low = _mulx_u64(a, b, high);
86static inline void _umul128to256_(uint64_t a_high, uint64_t a_low, uint64_t b_high, uint64_t b_low, uint64_t rr[4]) {
87 uint64_t p1_low, p1_high;
88 uint64_t p2_low, p2_high;
101 uint64_t carry = (rr[1] < p1_low) ? 1 : 0;
103 carry += (rr[1] < p2_low) ? 1 : 0;
106 carry = (rr[2] < carry) ? 1 : 0;
108 carry += (rr[2] < p1_high) ? 1 : 0;
110 carry += (rr[2] < p2_high) ? 1 : 0;
115static inline void _umul128to128_(uint64_t a_high, uint64_t a_low, uint64_t b_high, uint64_t b_low, uint64_t rr[2]) {
117 rr[1] += a_low * b_high;
118 rr[1] += a_high * b_low;
122#if defined(__GNUC__) || defined(__clang__)
124 return __builtin_clzll(val);
127 _BitScanReverse64(&cnt, val);
132static inline uint64_t
_udiv128by64to64_(uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t* r) {
133#if defined(__GNUC__) || defined(__clang__)
136 __asm__(
"div %[v]" :
"=a"(result),
"=d"(*r) : [v]
"r"(den),
"a"(numlo),
"d"(numhi));
139 __uint128_t num = (__uint128_t)numhi << 64 | numlo;
140 uint64_t result = num / den;
145 const uint64_t b = ((uint64_t)1 << 32);
154 uint64_t den10 = den;
155 uint64_t num10 = numlo;
179 numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63);
182 num1 = (uint32_t)(numlo >> 32);
183 num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
184 den1 = (uint32_t)(den >> 32);
185 den0 = (uint32_t)(den & 0xFFFFFFFFu);
190 c2 = rhat * b + num1;
192 qhat -= (
c1 - c2 > den) ? 2 : 1;
195 rem = numhi * b + num1 - q1 * den;
200 c2 = rhat * b + num0;
202 qhat -= (
c1 - c2 > den) ? 2 : 1;
205 q = ((uint64_t)q1 << 32) | q0;
208 *r = num10 - q * den10;
217#define ctz_shl(r, x, cnt) \
219 unsigned long long _x_ = (x); \
220 unsigned long _bits_ = 0; \
221 _BitScanForward64(&_bits_, _x_); \
223 (r) = _x_ >> (cnt); \
229#define ctz_shl(r, x, cnt) \
231 unsigned long long _x_ = (x); \
232 (cnt) = __builtin_ctzll(_x_); \
233 (r) = _x_ >> (cnt); \
244#define _u128lshl(x, y, n) \
246 (*((x) + 1)) = ((*(y)) >> (64 - (n))) | ((*((y) + 1)) << (n)); \
247 (*(x)) = (*(y)) << (n); \
250#define _u128lshr(x, y, n) \
252 (*(x)) = ((*(y)) >> (n)) | ((*((y) + 1)) << (64 - (n))); \
253 (*((x) + 1)) = (*((y) + 1)) >> (n); \
256#define _u128high(x) (*((x) + 1))
258#define _u128low(x) (*(x))
260#define _u128add(r, x, y) \
262 (*(r)) = *(x) + *(y); \
263 (*((r) + 1)) = (*((x) + 1)) + (*((y) + 1)) + ((*(r)) < (*(y)) ? 1 : 0); \
266#define _u128add64(r, x, _i64) \
268 (*(r)) = *(x) + (_i64); \
269 (*((r) + 1)) = (*((x) + 1)) + (((*(r)) < (_i64)) ? 1 : 0); \
272#define _u128sub64(r, x, _i64) \
274 uint64_t _c_ = (x)[0] < (_i64); \
275 (r)[0] = (x)[0] - (_i64); \
276 (r)[1] = (x)[1] - _c_; \
280#define _u128cmp(x, y) ((x)[1] < (y)[1] || ((x)[1] == (y)[1] && (x)[0] < (y)[0]))
282#define _u128sub(r, x, y) \
284 uint64_t _c_ = (x)[0] < (y)[0]; \
285 (r)[0] = (x)[0] - (y)[0]; \
286 (r)[1] = (x)[1] - (y)[1] - _c_; \
289#define _u128mul(r, x, y) _umul64to128_((x), (y), (r), (((r) + 1)))
291#define _mont64_add(r, x, y, mod2) \
294 (r) = ((r) < (mod2)) ? (r) : ((r) - (mod2)); \
297#define _mont64_sub(r, x, y, mod2) \
299 (r) = (((x) - (y)) > (x)) ? (((x) - (y)) + (mod2)) : ((x) - (y)); \
302#define _raw64_add(r, x, y) \
307#define _raw64_sub(r, x, y, mod2) \
309 (r) = (x) - (y) + (mod2); \
312#define _mont64_norm2(r, x, mod2) \
314 (r) = (x) >= (mod2) ? ((x) - (mod2)) : (x); \
324#define _mont64_mul(r, x, y, mod, modInvNeg) \
326 u128 _tmp1_ = {0, 0}; \
327 _u128mul(_tmp1_, (x), (y)); \
328 u128 _tmp2_ = {0, 0}; \
329 (*_tmp2_) = (*(_tmp1_)) * (modInvNeg); \
330 _u128mul(_tmp2_, (*(_tmp2_)), (mod)); \
331 _u128add(_tmp2_, _tmp2_, _tmp1_); \
332 (r) = (*(_tmp2_ + 1)); \
346#define _mont64_mulinto(x, y, mod, modInvNeg) \
348 u128 _tmp1_ = {0, 0}; \
349 _u128mul(_tmp1_, (x), (y)); \
350 u128 _tmp2_ = {0, 0}; \
351 *(_tmp2_) = (*(_tmp1_)) * (modInvNeg); \
352 _u128mul(_tmp2_, *(_tmp2_), (mod)); \
353 _u128add(_tmp2_, _tmp2_, _tmp1_); \
354 (x) = (*(_tmp2_ + 1) < (mod)) ? (*(_tmp2_ + 1)) : ((*(_tmp2_ + 1)) - (mod)); \
357#define _mont64_tomont(x, r2, mod, modInvNeg) \
359 u128 _tmp1_ = {0, 0}; \
360 _u128mul(_tmp1_, (x), (r2)); \
361 u128 _tmp2_ = {0, 0}; \
362 *(_tmp2_) = (*(_tmp1_)) * (modInvNeg); \
363 _u128mul(_tmp2_, (*(_tmp2_)), (mod)); \
364 _u128add(_tmp2_, _tmp2_, _tmp1_); \
365 (x) = ((*(_tmp2_ + 1)) < (mod)) ? (*(_tmp2_ + 1)) : ((*(_tmp2_ + 1)) - (mod)); \
368#define _mont64_toint(x, mod, modInvneg) \
370 u128 _tmp_ = {0, 0}; \
371 *_tmp_ = (x) * (modInvneg); \
372 _u128mul(_tmp_, (*_tmp_), (mod)); \
373 _u128add64(_tmp_, _tmp_, x); \
374 (x) = (*(_tmp_ + 1) < (mod)) ? (*(_tmp_ + 1)) : ((*(_tmp_ + 1)) - (mod)); \
377#define _u128x64to192(i192, i128, i64) \
379 _umul64to128_((*(i128)), i64, (i192), ((i192) + 1)); \
381 _umul64to128_((*((i128) + 1)), i64, (&(_tmp_)), ((i192) + 2)); \
382 (*((i192) + 1)) += _tmp_; \
383 (*((i192) + 2)) += ((*((i192) + 1)) < _tmp_) ? 1 : 0; \
386#define _u192add(i192, j192) \
388 (*(i192)) += (*(j192)); \
389 uint64_t _c_ = ((*(i192)) < (*j192)) ? 1 : 0; \
390 (*((i192) + 1)) += _c_; \
391 _c_ = ((*((i192) + 1)) < _c_) ? 1 : 0; \
392 (*((i192) + 1)) += (*((j192) + 1)); \
393 _c_ += ((*((i192) + 1)) < (*((j192) + 1))) ? 1 : 0; \
394 (*((i192) + 2)) += _c_ + (*((j192) + 2)); \
397#define _u192sub(i192, j192) \
399 uint64_t _b_ = ((i192)[0] < (j192)[0]) ? 1 : 0; \
400 (i192)[0] -= (j192)[0]; \
401 uint64_t _b1_ = ((i192)[1] < (j192)[1]) ? 1 : 0; \
402 (i192)[1] -= (j192)[1]; \
403 _b1_ += ((i192)[1] < _b_) ? 1 : 0; \
405 (i192)[2] = (i192)[2] - ((j192)[2] + _b1_); \
408#define _add_ssaaaa(sh, sl, ah, al, bh, bl) \
412 (sh) = (ah) + (bh) + (_x_ < (al)); \
416#define _sub_ddmmss(sh, sl, ah, al, bh, bl) \
420 (sh) = (ah) - (bh) - ((al) < (bl)); \
424#define _udiv_qrnnd_preinv(q, r, nh, nl, d, di) \
426 uint64_t _qh_, _ql_, _r_, _mask_; \
427 _umul64to128_((nh), (di), &_ql_, &_qh_); \
428 _add_ssaaaa(_qh_, _ql_, _qh_, _ql_, (nh) + 1, (nl)); \
429 _r_ = (nl) - _qh_ * (d); \
430 _mask_ = -(mp_limb_t)(_r_ > _ql_); \
432 _r_ += _mask_ & (d); \
441#define _udiv_qr_3by2(q, r1, r0, n2, n1, n0, d1, d0, dinv) \
443 mp_limb_t _q0_, _t1_, _t0_, _mask_; \
444 _umul64to128_((n2), (dinv), &_q0_, &(q)); \
445 _add_ssaaaa((q), _q0_, (q), _q0_, (n2), (n1)); \
447 (r1) = (n1) - (d1) * (q); \
448 _sub_ddmmss((r1), (r0), (r1), (n0), (d1), (d0)); \
449 _umul64to128_((d0), (q), &_t0_, &_t1_); \
450 _sub_ddmmss((r1), (r0), (r1), (r0), _t1_, _t0_); \
453 _mask_ = -(uint64_t)((r1) >= _q0_); \
455 _add_ssaaaa((r1), (r0), (r1), (r0), _mask_ & (d1), _mask_ & (d0)); \
456 if ((r1) >= (d1)) { \
457 if ((r1) > (d1) || (r0) >= (d0)) { \
459 _sub_ddmmss((r1), (r0), (r1), (r0), (d1), (d0)); \
466#define _udiv32by32_q_preinv(q, n0, dinv) \
468 uint64_t _hi_, _lo_; \
469 _umul64to128_((n0), (dinv), &_lo_, &_hi_); \
477#define _U64_SHIFT_MASK 0x3F
478#define _ADD_MARKER 0x40
488 uint32_t floor_log_2_d = 63 -
_clz_u64_(d);
491 if ((d & (d - 1)) == 0) {
497 result.
more = (uint8_t)(floor_log_2_d - (branchfree != 0));
499 uint64_t proposed_m, rem;
504 const uint64_t e = d - rem;
507 if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
509 more = (uint8_t)floor_log_2_d;
516 proposed_m += proposed_m;
517 const uint64_t twice_rem = rem + rem;
518 if (twice_rem >= d || twice_rem < rem)
522 result.
magic = 1 + proposed_m;
543 uint64_t t = ((numer - q) >> 1) + q;
544 return t >> denom->
more;
static uint64_t _umul64to64hi_(uint64_t a, uint64_t b)
#define _U64_SHIFT_MASK
from https://libdivide.com/
static _udiv64_t _udiv64_gen_internal_(uint64_t d, int branchfree)
static void _umul64to128_(uint64_t a, uint64_t b, uint64_t *low, uint64_t *high)
static void _umul128to256_(uint64_t a_high, uint64_t a_low, uint64_t b_high, uint64_t b_low, uint64_t rr[4])
static _udiv64_t _udiv64_gen(uint64_t d)
static int32_t _clz_u64_(uint64_t val)
static void _umul128to128_(uint64_t a_high, uint64_t a_low, uint64_t b_high, uint64_t b_low, uint64_t rr[2])
static uint64_t _udiv64by64_q_preinv(uint64_t numer, const _udiv64_t *denom)
static uint64_t _udiv128by64to64_(uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r)
uint64_t u128[2]
请注意,此处的蒙哥马利域的R为2^64,p不可超过2^63-1
static void _umulx64to128_(uint64_t a, uint64_t b, uint64_t *low, uint64_t *high)