diff options
Diffstat (limited to 'lib/quickjs/libbf.c')
-rw-r--r-- | lib/quickjs/libbf.c | 8466 |
1 files changed, 8466 insertions, 0 deletions
diff --git a/lib/quickjs/libbf.c b/lib/quickjs/libbf.c new file mode 100644 index 00000000..fe1628e7 --- /dev/null +++ b/lib/quickjs/libbf.c @@ -0,0 +1,8466 @@ +/* + * Tiny arbitrary precision floating point library + * + * Copyright (c) 2017-2021 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include <stdlib.h> +#include <stdio.h> +#include <inttypes.h> +#include <math.h> +#include <string.h> +#include <assert.h> + +#ifdef __AVX2__ +#include <immintrin.h> +#endif + +#include "cutils.h" +#include "libbf.h" + +/* enable it to check the multiplication result */ +//#define USE_MUL_CHECK +/* enable it to use FFT/NTT multiplication */ +#define USE_FFT_MUL +/* enable decimal floating point support */ +#define USE_BF_DEC + +//#define inline __attribute__((always_inline)) + +#ifdef __AVX2__ +#define FFT_MUL_THRESHOLD 100 /* in limbs of the smallest factor */ +#else +#define FFT_MUL_THRESHOLD 100 /* in limbs of the smallest factor */ +#endif + +/* XXX: adjust */ +#define DIVNORM_LARGE_THRESHOLD 50 +#define UDIV1NORM_THRESHOLD 3 + +#if LIMB_BITS == 64 +#define FMT_LIMB1 "%" PRIx64 +#define FMT_LIMB "%016" PRIx64 +#define PRId_LIMB PRId64 +#define PRIu_LIMB PRIu64 + +#else + +#define FMT_LIMB1 "%x" +#define FMT_LIMB "%08x" +#define PRId_LIMB "d" +#define PRIu_LIMB "u" + +#endif + +typedef intptr_t mp_size_t; + +typedef int bf_op2_func_t(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags); + +#ifdef USE_FFT_MUL + +#define FFT_MUL_R_OVERLAP_A (1 << 0) +#define FFT_MUL_R_OVERLAP_B (1 << 1) +#define FFT_MUL_R_NORESIZE (1 << 2) + +static no_inline int fft_mul(bf_context_t *s, + bf_t *res, limb_t *a_tab, limb_t a_len, + limb_t *b_tab, limb_t b_len, int mul_flags); +static void fft_clear_cache(bf_context_t *s); +#endif +#ifdef USE_BF_DEC +static limb_t get_digit(const limb_t *tab, limb_t len, slimb_t pos); +#endif + + +/* could leading zeros */ +static inline int clz(limb_t a) +{ + if (a == 0) { + return LIMB_BITS; + } else { +#if LIMB_BITS == 64 + return clz64(a); +#else + return clz32(a); +#endif + } +} + +static inline int ctz(limb_t a) +{ + if (a == 0) { + return LIMB_BITS; + } else { +#if LIMB_BITS == 64 + return ctz64(a); +#else + return ctz32(a); +#endif + } +} + +static inline int ceil_log2(limb_t a) +{ + if (a <= 1) + return 0; + else + return LIMB_BITS - clz(a - 1); +} + +/* b must be >= 1 */ +static inline slimb_t ceil_div(slimb_t a, slimb_t b) +{ + if (a >= 0) + return (a + b - 1) / b; + else + return a / b; +} + +/* b must be >= 1 */ +static inline slimb_t floor_div(slimb_t a, slimb_t b) +{ + if (a >= 0) { + return a / b; + } else { + return (a - b + 1) / b; + } +} + +/* return r = a modulo b (0 <= r <= b - 1. b must be >= 1 */ +static inline limb_t smod(slimb_t a, slimb_t b) +{ + a = a % (slimb_t)b; + if (a < 0) + a += b; + return a; +} + +/* signed addition with saturation */ +static inline slimb_t sat_add(slimb_t a, slimb_t b) +{ + slimb_t r; + r = a + b; + /* overflow ? */ + if (((a ^ r) & (b ^ r)) < 0) + r = (a >> (LIMB_BITS - 1)) ^ (((limb_t)1 << (LIMB_BITS - 1)) - 1); + return r; +} + +#define malloc(s) malloc_is_forbidden(s) +#define free(p) free_is_forbidden(p) +#define realloc(p, s) realloc_is_forbidden(p, s) + +void bf_context_init(bf_context_t *s, bf_realloc_func_t *realloc_func, + void *realloc_opaque) +{ + memset(s, 0, sizeof(*s)); + s->realloc_func = realloc_func; + s->realloc_opaque = realloc_opaque; +} + +void bf_context_end(bf_context_t *s) +{ + bf_clear_cache(s); +} + +void bf_init(bf_context_t *s, bf_t *r) +{ + r->ctx = s; + r->sign = 0; + r->expn = BF_EXP_ZERO; + r->len = 0; + r->tab = NULL; +} + +/* return 0 if OK, -1 if alloc error */ +int bf_resize(bf_t *r, limb_t len) +{ + limb_t *tab; + + if (len != r->len) { + tab = bf_realloc(r->ctx, r->tab, len * sizeof(limb_t)); + if (!tab && len != 0) + return -1; + r->tab = tab; + r->len = len; + } + return 0; +} + +/* return 0 or BF_ST_MEM_ERROR */ +int bf_set_ui(bf_t *r, uint64_t a) +{ + r->sign = 0; + if (a == 0) { + r->expn = BF_EXP_ZERO; + bf_resize(r, 0); /* cannot fail */ + } +#if LIMB_BITS == 32 + else if (a <= 0xffffffff) +#else + else +#endif + { + int shift; + if (bf_resize(r, 1)) + goto fail; + shift = clz(a); + r->tab[0] = a << shift; + r->expn = LIMB_BITS - shift; + } +#if LIMB_BITS == 32 + else { + uint32_t a1, a0; + int shift; + if (bf_resize(r, 2)) + goto fail; + a0 = a; + a1 = a >> 32; + shift = clz(a1); + r->tab[0] = a0 << shift; + r->tab[1] = (a1 << shift) | (a0 >> (LIMB_BITS - shift)); + r->expn = 2 * LIMB_BITS - shift; + } +#endif + return 0; + fail: + bf_set_nan(r); + return BF_ST_MEM_ERROR; +} + +/* return 0 or BF_ST_MEM_ERROR */ +int bf_set_si(bf_t *r, int64_t a) +{ + int ret; + + if (a < 0) { + ret = bf_set_ui(r, -a); + r->sign = 1; + } else { + ret = bf_set_ui(r, a); + } + return ret; +} + +void bf_set_nan(bf_t *r) +{ + bf_resize(r, 0); /* cannot fail */ + r->expn = BF_EXP_NAN; + r->sign = 0; +} + +void bf_set_zero(bf_t *r, int is_neg) +{ + bf_resize(r, 0); /* cannot fail */ + r->expn = BF_EXP_ZERO; + r->sign = is_neg; +} + +void bf_set_inf(bf_t *r, int is_neg) +{ + bf_resize(r, 0); /* cannot fail */ + r->expn = BF_EXP_INF; + r->sign = is_neg; +} + +/* return 0 or BF_ST_MEM_ERROR */ +int bf_set(bf_t *r, const bf_t *a) +{ + if (r == a) + return 0; + if (bf_resize(r, a->len)) { + bf_set_nan(r); + return BF_ST_MEM_ERROR; + } + r->sign = a->sign; + r->expn = a->expn; + memcpy(r->tab, a->tab, a->len * sizeof(limb_t)); + return 0; +} + +/* equivalent to bf_set(r, a); bf_delete(a) */ +void bf_move(bf_t *r, bf_t *a) +{ + bf_context_t *s = r->ctx; + if (r == a) + return; + bf_free(s, r->tab); + *r = *a; +} + +static limb_t get_limbz(const bf_t *a, limb_t idx) +{ + if (idx >= a->len) + return 0; + else + return a->tab[idx]; +} + +/* get LIMB_BITS at bit position 'pos' in tab */ +static inline limb_t get_bits(const limb_t *tab, limb_t len, slimb_t pos) +{ + limb_t i, a0, a1; + int p; + + i = pos >> LIMB_LOG2_BITS; + p = pos & (LIMB_BITS - 1); + if (i < len) + a0 = tab[i]; + else + a0 = 0; + if (p == 0) { + return a0; + } else { + i++; + if (i < len) + a1 = tab[i]; + else + a1 = 0; + return (a0 >> p) | (a1 << (LIMB_BITS - p)); + } +} + +static inline limb_t get_bit(const limb_t *tab, limb_t len, slimb_t pos) +{ + slimb_t i; + i = pos >> LIMB_LOG2_BITS; + if (i < 0 || i >= len) + return 0; + return (tab[i] >> (pos & (LIMB_BITS - 1))) & 1; +} + +static inline limb_t limb_mask(int start, int last) +{ + limb_t v; + int n; + n = last - start + 1; + if (n == LIMB_BITS) + v = -1; + else + v = (((limb_t)1 << n) - 1) << start; + return v; +} + +static limb_t mp_scan_nz(const limb_t *tab, mp_size_t n) +{ + mp_size_t i; + for(i = 0; i < n; i++) { + if (tab[i] != 0) + return 1; + } + return 0; +} + +/* return != 0 if one bit between 0 and bit_pos inclusive is not zero. */ +static inline limb_t scan_bit_nz(const bf_t *r, slimb_t bit_pos) +{ + slimb_t pos; + limb_t v; + + pos = bit_pos >> LIMB_LOG2_BITS; + if (pos < 0) + return 0; + v = r->tab[pos] & limb_mask(0, bit_pos & (LIMB_BITS - 1)); + if (v != 0) + return 1; + pos--; + while (pos >= 0) { + if (r->tab[pos] != 0) + return 1; + pos--; + } + return 0; +} + +/* return the addend for rounding. Note that prec can be <= 0 (for + BF_FLAG_RADPNT_PREC) */ +static int bf_get_rnd_add(int *pret, const bf_t *r, limb_t l, + slimb_t prec, int rnd_mode) +{ + int add_one, inexact; + limb_t bit1, bit0; + + if (rnd_mode == BF_RNDF) { + bit0 = 1; /* faithful rounding does not honor the INEXACT flag */ + } else { + /* starting limb for bit 'prec + 1' */ + bit0 = scan_bit_nz(r, l * LIMB_BITS - 1 - bf_max(0, prec + 1)); + } + + /* get the bit at 'prec' */ + bit1 = get_bit(r->tab, l, l * LIMB_BITS - 1 - prec); + inexact = (bit1 | bit0) != 0; + + add_one = 0; + switch(rnd_mode) { + case BF_RNDZ: + break; + case BF_RNDN: + if (bit1) { + if (bit0) { + add_one = 1; + } else { + /* round to even */ + add_one = + get_bit(r->tab, l, l * LIMB_BITS - 1 - (prec - 1)); + } + } + break; + case BF_RNDD: + case BF_RNDU: + if (r->sign == (rnd_mode == BF_RNDD)) + add_one = inexact; + break; + case BF_RNDA: + add_one = inexact; + break; + case BF_RNDNA: + case BF_RNDF: + add_one = bit1; + break; + default: + abort(); + } + + if (inexact) + *pret |= BF_ST_INEXACT; + return add_one; +} + +static int bf_set_overflow(bf_t *r, int sign, limb_t prec, bf_flags_t flags) +{ + slimb_t i, l, e_max; + int rnd_mode; + + rnd_mode = flags & BF_RND_MASK; + if (prec == BF_PREC_INF || + rnd_mode == BF_RNDN || + rnd_mode == BF_RNDNA || + rnd_mode == BF_RNDA || + (rnd_mode == BF_RNDD && sign == 1) || + (rnd_mode == BF_RNDU && sign == 0)) { + bf_set_inf(r, sign); + } else { + /* set to maximum finite number */ + l = (prec + LIMB_BITS - 1) / LIMB_BITS; + if (bf_resize(r, l)) { + bf_set_nan(r); + return BF_ST_MEM_ERROR; + } + r->tab[0] = limb_mask((-prec) & (LIMB_BITS - 1), + LIMB_BITS - 1); + for(i = 1; i < l; i++) + r->tab[i] = (limb_t)-1; + e_max = (limb_t)1 << (bf_get_exp_bits(flags) - 1); + r->expn = e_max; + r->sign = sign; + } + return BF_ST_OVERFLOW | BF_ST_INEXACT; +} + +/* round to prec1 bits assuming 'r' is non zero and finite. 'r' is + assumed to have length 'l' (1 <= l <= r->len). Note: 'prec1' can be + infinite (BF_PREC_INF). 'ret' is 0 or BF_ST_INEXACT if the result + is known to be inexact. Can fail with BF_ST_MEM_ERROR in case of + overflow not returning infinity. */ +static int __bf_round(bf_t *r, limb_t prec1, bf_flags_t flags, limb_t l, + int ret) +{ + limb_t v, a; + int shift, add_one, rnd_mode; + slimb_t i, bit_pos, pos, e_min, e_max, e_range, prec; + + /* e_min and e_max are computed to match the IEEE 754 conventions */ + e_range = (limb_t)1 << (bf_get_exp_bits(flags) - 1); + e_min = -e_range + 3; + e_max = e_range; + + if (flags & BF_FLAG_RADPNT_PREC) { + /* 'prec' is the precision after the radix point */ + if (prec1 != BF_PREC_INF) + prec = r->expn + prec1; + else + prec = prec1; + } else if (unlikely(r->expn < e_min) && (flags & BF_FLAG_SUBNORMAL)) { + /* restrict the precision in case of potentially subnormal + result */ + assert(prec1 != BF_PREC_INF); + prec = prec1 - (e_min - r->expn); + } else { + prec = prec1; + } + + /* round to prec bits */ + rnd_mode = flags & BF_RND_MASK; + add_one = bf_get_rnd_add(&ret, r, l, prec, rnd_mode); + + if (prec <= 0) { + if (add_one) { + bf_resize(r, 1); /* cannot fail */ + r->tab[0] = (limb_t)1 << (LIMB_BITS - 1); + r->expn += 1 - prec; + ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT; + return ret; + } else { + goto underflow; + } + } else if (add_one) { + limb_t carry; + + /* add one starting at digit 'prec - 1' */ + bit_pos = l * LIMB_BITS - 1 - (prec - 1); + pos = bit_pos >> LIMB_LOG2_BITS; + carry = (limb_t)1 << (bit_pos & (LIMB_BITS - 1)); + + for(i = pos; i < l; i++) { + v = r->tab[i] + carry; + carry = (v < carry); + r->tab[i] = v; + if (carry == 0) + break; + } + if (carry) { + /* shift right by one digit */ + v = 1; + for(i = l - 1; i >= pos; i--) { + a = r->tab[i]; + r->tab[i] = (a >> 1) | (v << (LIMB_BITS - 1)); + v = a; + } + r->expn++; + } + } + + /* check underflow */ + if (unlikely(r->expn < e_min)) { + if (flags & BF_FLAG_SUBNORMAL) { + /* if inexact, also set the underflow flag */ + if (ret & BF_ST_INEXACT) + ret |= BF_ST_UNDERFLOW; + } else { + underflow: + ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT; + bf_set_zero(r, r->sign); + return ret; + } + } + + /* check overflow */ + if (unlikely(r->expn > e_max)) + return bf_set_overflow(r, r->sign, prec1, flags); + + /* keep the bits starting at 'prec - 1' */ + bit_pos = l * LIMB_BITS - 1 - (prec - 1); + i = bit_pos >> LIMB_LOG2_BITS; + if (i >= 0) { + shift = bit_pos & (LIMB_BITS - 1); + if (shift != 0) + r->tab[i] &= limb_mask(shift, LIMB_BITS - 1); + } else { + i = 0; + } + /* remove trailing zeros */ + while (r->tab[i] == 0) + i++; + if (i > 0) { + l -= i; + memmove(r->tab, r->tab + i, l * sizeof(limb_t)); + } + bf_resize(r, l); /* cannot fail */ + return ret; +} + +/* 'r' must be a finite number. */ +int bf_normalize_and_round(bf_t *r, limb_t prec1, bf_flags_t flags) +{ + limb_t l, v, a; + int shift, ret; + slimb_t i; + + // bf_print_str("bf_renorm", r); + l = r->len; + while (l > 0 && r->tab[l - 1] == 0) + l--; + if (l == 0) { + /* zero */ + r->expn = BF_EXP_ZERO; + bf_resize(r, 0); /* cannot fail */ + ret = 0; + } else { + r->expn -= (r->len - l) * LIMB_BITS; + /* shift to have the MSB set to '1' */ + v = r->tab[l - 1]; + shift = clz(v); + if (shift != 0) { + v = 0; + for(i = 0; i < l; i++) { + a = r->tab[i]; + r->tab[i] = (a << shift) | (v >> (LIMB_BITS - shift)); + v = a; + } + r->expn -= shift; + } + ret = __bf_round(r, prec1, flags, l, 0); + } + // bf_print_str("r_final", r); + return ret; +} + +/* return true if rounding can be done at precision 'prec' assuming + the exact result r is such that |r-a| <= 2^(EXP(a)-k). */ +/* XXX: check the case where the exponent would be incremented by the + rounding */ +int bf_can_round(const bf_t *a, slimb_t prec, bf_rnd_t rnd_mode, slimb_t k) +{ + BOOL is_rndn; + slimb_t bit_pos, n; + limb_t bit; + + if (a->expn == BF_EXP_INF || a->expn == BF_EXP_NAN) + return FALSE; + if (rnd_mode == BF_RNDF) { + return (k >= (prec + 1)); + } + if (a->expn == BF_EXP_ZERO) + return FALSE; + is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA); + if (k < (prec + 2)) + return FALSE; + bit_pos = a->len * LIMB_BITS - 1 - prec; + n = k - prec; + /* bit pattern for RNDN or RNDNA: 0111.. or 1000... + for other rounding modes: 000... or 111... + */ + bit = get_bit(a->tab, a->len, bit_pos); + bit_pos--; + n--; + bit ^= is_rndn; + /* XXX: slow, but a few iterations on average */ + while (n != 0) { + if (get_bit(a->tab, a->len, bit_pos) != bit) + return TRUE; + bit_pos--; + n--; + } + return FALSE; +} + +/* Cannot fail with BF_ST_MEM_ERROR. */ +int bf_round(bf_t *r, limb_t prec, bf_flags_t flags) +{ + if (r->len == 0) + return 0; + return __bf_round(r, prec, flags, r->len, 0); +} + +/* for debugging */ +static __maybe_unused void dump_limbs(const char *str, const limb_t *tab, limb_t n) +{ + limb_t i; + printf("%s: len=%" PRId_LIMB "\n", str, n); + for(i = 0; i < n; i++) { + printf("%" PRId_LIMB ": " FMT_LIMB "\n", + i, tab[i]); + } +} + +void mp_print_str(const char *str, const limb_t *tab, limb_t n) +{ + slimb_t i; + printf("%s= 0x", str); + for(i = n - 1; i >= 0; i--) { + if (i != (n - 1)) + printf("_"); + printf(FMT_LIMB, tab[i]); + } + printf("\n"); +} + +static __maybe_unused void mp_print_str_h(const char *str, + const limb_t *tab, limb_t n, + limb_t high) +{ + slimb_t i; + printf("%s= 0x", str); + printf(FMT_LIMB, high); + for(i = n - 1; i >= 0; i--) { + printf("_"); + printf(FMT_LIMB, tab[i]); + } + printf("\n"); +} + +/* for debugging */ +void bf_print_str(const char *str, const bf_t *a) +{ + slimb_t i; + printf("%s=", str); + + if (a->expn == BF_EXP_NAN) { + printf("NaN"); + } else { + if (a->sign) + putchar('-'); + if (a->expn == BF_EXP_ZERO) { + putchar('0'); + } else if (a->expn == BF_EXP_INF) { + printf("Inf"); + } else { + printf("0x0."); + for(i = a->len - 1; i >= 0; i--) + printf(FMT_LIMB, a->tab[i]); + printf("p%" PRId_LIMB, a->expn); + } + } + printf("\n"); +} + +/* compare the absolute value of 'a' and 'b'. Return < 0 if a < b, 0 + if a = b and > 0 otherwise. */ +int bf_cmpu(const bf_t *a, const bf_t *b) +{ + slimb_t i; + limb_t len, v1, v2; + + if (a->expn != b->expn) { + if (a->expn < b->expn) + return -1; + else + return 1; + } + len = bf_max(a->len, b->len); + for(i = len - 1; i >= 0; i--) { + v1 = get_limbz(a, a->len - len + i); + v2 = get_limbz(b, b->len - len + i); + if (v1 != v2) { + if (v1 < v2) + return -1; + else + return 1; + } + } + return 0; +} + +/* Full order: -0 < 0, NaN == NaN and NaN is larger than all other numbers */ +int bf_cmp_full(const bf_t *a, const bf_t *b) +{ + int res; + + if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { + if (a->expn == b->expn) + res = 0; + else if (a->expn == BF_EXP_NAN) + res = 1; + else + res = -1; + } else if (a->sign != b->sign) { + res = 1 - 2 * a->sign; + } else { + res = bf_cmpu(a, b); + if (a->sign) + res = -res; + } + return res; +} + +/* Standard floating point comparison: return 2 if one of the operands + is NaN (unordered) or -1, 0, 1 depending on the ordering assuming + -0 == +0 */ +int bf_cmp(const bf_t *a, const bf_t *b) +{ + int res; + + if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { + res = 2; + } else if (a->sign != b->sign) { + if (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_ZERO) + res = 0; + else + res = 1 - 2 * a->sign; + } else { + res = bf_cmpu(a, b); + if (a->sign) + res = -res; + } + return res; +} + +/* Compute the number of bits 'n' matching the pattern: + a= X1000..0 + b= X0111..1 + + When computing a-b, the result will have at least n leading zero + bits. + + Precondition: a > b and a.expn - b.expn = 0 or 1 +*/ +static limb_t count_cancelled_bits(const bf_t *a, const bf_t *b) +{ + slimb_t bit_offset, b_offset, n; + int p, p1; + limb_t v1, v2, mask; + + bit_offset = a->len * LIMB_BITS - 1; + b_offset = (b->len - a->len) * LIMB_BITS - (LIMB_BITS - 1) + + a->expn - b->expn; + n = 0; + + /* first search the equals bits */ + for(;;) { + v1 = get_limbz(a, bit_offset >> LIMB_LOG2_BITS); + v2 = get_bits(b->tab, b->len, bit_offset + b_offset); + // printf("v1=" FMT_LIMB " v2=" FMT_LIMB "\n", v1, v2); + if (v1 != v2) + break; + n += LIMB_BITS; + bit_offset -= LIMB_BITS; + } + /* find the position of the first different bit */ + p = clz(v1 ^ v2) + 1; + n += p; + /* then search for '0' in a and '1' in b */ + p = LIMB_BITS - p; + if (p > 0) { + /* search in the trailing p bits of v1 and v2 */ + mask = limb_mask(0, p - 1); + p1 = bf_min(clz(v1 & mask), clz((~v2) & mask)) - (LIMB_BITS - p); + n += p1; + if (p1 != p) + goto done; + } + bit_offset -= LIMB_BITS; + for(;;) { + v1 = get_limbz(a, bit_offset >> LIMB_LOG2_BITS); + v2 = get_bits(b->tab, b->len, bit_offset + b_offset); + // printf("v1=" FMT_LIMB " v2=" FMT_LIMB "\n", v1, v2); + if (v1 != 0 || v2 != -1) { + /* different: count the matching bits */ + p1 = bf_min(clz(v1), clz(~v2)); + n += p1; + break; + } + n += LIMB_BITS; + bit_offset -= LIMB_BITS; + } + done: + return n; +} + +static int bf_add_internal(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags, int b_neg) +{ + const bf_t *tmp; + int is_sub, ret, cmp_res, a_sign, b_sign; + + a_sign = a->sign; + b_sign = b->sign ^ b_neg; + is_sub = a_sign ^ b_sign; + cmp_res = bf_cmpu(a, b); + if (cmp_res < 0) { + tmp = a; + a = b; + b = tmp; + a_sign = b_sign; /* b_sign is never used later */ + } + /* abs(a) >= abs(b) */ + if (cmp_res == 0 && is_sub && a->expn < BF_EXP_INF) { + /* zero result */ + bf_set_zero(r, (flags & BF_RND_MASK) == BF_RNDD); + ret = 0; + } else if (a->len == 0 || b->len == 0) { + ret = 0; + if (a->expn >= BF_EXP_INF) { + if (a->expn == BF_EXP_NAN) { + /* at least one operand is NaN */ + bf_set_nan(r); + } else if (b->expn == BF_EXP_INF && is_sub) { + /* infinities with different signs */ + bf_set_nan(r); + ret = BF_ST_INVALID_OP; + } else { + bf_set_inf(r, a_sign); + } + } else { + /* at least one zero and not subtract */ + bf_set(r, a); + r->sign = a_sign; + goto renorm; + } + } else { + slimb_t d, a_offset, b_bit_offset, i, cancelled_bits; + limb_t carry, v1, v2, u, r_len, carry1, precl, tot_len, z, sub_mask; + + r->sign = a_sign; + r->expn = a->expn; + d = a->expn - b->expn; + /* must add more precision for the leading cancelled bits in + subtraction */ + if (is_sub) { + if (d <= 1) + cancelled_bits = count_cancelled_bits(a, b); + else + cancelled_bits = 1; + } else { + cancelled_bits = 0; + } + + /* add two extra bits for rounding */ + precl = (cancelled_bits + prec + 2 + LIMB_BITS - 1) / LIMB_BITS; + tot_len = bf_max(a->len, b->len + (d + LIMB_BITS - 1) / LIMB_BITS); + r_len = bf_min(precl, tot_len); + if (bf_resize(r, r_len)) + goto fail; + a_offset = a->len - r_len; + b_bit_offset = (b->len - r_len) * LIMB_BITS + d; + + /* compute the bits before for the rounding */ + carry = is_sub; + z = 0; + sub_mask = -is_sub; + i = r_len - tot_len; + while (i < 0) { + slimb_t ap, bp; + BOOL inflag; + + ap = a_offset + i; + bp = b_bit_offset + i * LIMB_BITS; + inflag = FALSE; + if (ap >= 0 && ap < a->len) { + v1 = a->tab[ap]; + inflag = TRUE; + } else { + v1 = 0; + } + if (bp + LIMB_BITS > 0 && bp < (slimb_t)(b->len * LIMB_BITS)) { + v2 = get_bits(b->tab, b->len, bp); + inflag = TRUE; + } else { + v2 = 0; + } + if (!inflag) { + /* outside 'a' and 'b': go directly to the next value + inside a or b so that the running time does not + depend on the exponent difference */ + i = 0; + if (ap < 0) + i = bf_min(i, -a_offset); + /* b_bit_offset + i * LIMB_BITS + LIMB_BITS >= 1 + equivalent to + i >= ceil(-b_bit_offset + 1 - LIMB_BITS) / LIMB_BITS) + */ + if (bp + LIMB_BITS <= 0) + i = bf_min(i, (-b_bit_offset) >> LIMB_LOG2_BITS); + } else { + i++; + } + v2 ^= sub_mask; + u = v1 + v2; + carry1 = u < v1; + u += carry; + carry = (u < carry) | carry1; + z |= u; + } + /* and the result */ + for(i = 0; i < r_len; i++) { + v1 = get_limbz(a, a_offset + i); + v2 = get_bits(b->tab, b->len, b_bit_offset + i * LIMB_BITS); + v2 ^= sub_mask; + u = v1 + v2; + carry1 = u < v1; + u += carry; + carry = (u < carry) | carry1; + r->tab[i] = u; + } + /* set the extra bits for the rounding */ + r->tab[0] |= (z != 0); + + /* carry is only possible in add case */ + if (!is_sub && carry) { + if (bf_resize(r, r_len + 1)) + goto fail; + r->tab[r_len] = 1; + r->expn += LIMB_BITS; + } + renorm: + ret = bf_normalize_and_round(r, prec, flags); + } + return ret; + fail: + bf_set_nan(r); + return BF_ST_MEM_ERROR; +} + +static int __bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags) +{ + return bf_add_internal(r, a, b, prec, flags, 0); +} + +static int __bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags) +{ + return bf_add_internal(r, a, b, prec, flags, 1); +} + +limb_t mp_add(limb_t *res, const limb_t *op1, const limb_t *op2, + limb_t n, limb_t carry) +{ + slimb_t i; + limb_t k, a, v, k1; + + k = carry; + for(i=0;i<n;i++) { + v = op1[i]; + a = v + op2[i]; + k1 = a < v; + a = a + k; + k = (a < k) | k1; + res[i] = a; + } + return k; +} + +limb_t mp_add_ui(limb_t *tab, limb_t b, size_t n) +{ + size_t i; + limb_t k, a; + + k=b; + for(i=0;i<n;i++) { + if (k == 0) + break; + a = tab[i] + k; + k = (a < k); + tab[i] = a; + } + return k; +} + +limb_t mp_sub(limb_t *res, const limb_t *op1, const limb_t *op2, + mp_size_t n, limb_t carry) +{ + int i; + limb_t k, a, v, k1; + + k = carry; + for(i=0;i<n;i++) { + v = op1[i]; + a = v - op2[i]; + k1 = a > v; + v = a - k; + k = (v > a) | k1; + res[i] = v; + } + return k; +} + +/* compute 0 - op2 */ +static limb_t mp_neg(limb_t *res, const limb_t *op2, mp_size_t n, limb_t carry) +{ + int i; + limb_t k, a, v, k1; + + k = carry; + for(i=0;i<n;i++) { + v = 0; + a = v - op2[i]; + k1 = a > v; + v = a - k; + k = (v > a) | k1; + res[i] = v; + } + return k; +} + +limb_t mp_sub_ui(limb_t *tab, limb_t b, mp_size_t n) +{ + mp_size_t i; + limb_t k, a, v; + + k=b; + for(i=0;i<n;i++) { + v = tab[i]; + a = v - k; + k = a > v; + tab[i] = a; + if (k == 0) + break; + } + return k; +} + +/* r = (a + high*B^n) >> shift. Return the remainder r (0 <= r < 2^shift). + 1 <= shift <= LIMB_BITS - 1 */ +static limb_t mp_shr(limb_t *tab_r, const limb_t *tab, mp_size_t n, + int shift, limb_t high) +{ + mp_size_t i; + limb_t l, a; + + assert(shift >= 1 && shift < LIMB_BITS); + l = high; + for(i = n - 1; i >= 0; i--) { + a = tab[i]; + tab_r[i] = (a >> shift) | (l << (LIMB_BITS - shift)); + l = a; + } + return l & (((limb_t)1 << shift) - 1); +} + +/* tabr[] = taba[] * b + l. Return the high carry */ +static limb_t mp_mul1(limb_t *tabr, const limb_t *taba, limb_t n, + limb_t b, limb_t l) +{ + limb_t i; + dlimb_t t; + + for(i = 0; i < n; i++) { + t = (dlimb_t)taba[i] * (dlimb_t)b + l; + tabr[i] = t; + l = t >> LIMB_BITS; + } + return l; +} + +/* tabr[] += taba[] * b, return the high word. */ +static limb_t mp_add_mul1(limb_t *tabr, const limb_t *taba, limb_t n, + limb_t b) +{ + limb_t i, l; + dlimb_t t; + + l = 0; + for(i = 0; i < n; i++) { + t = (dlimb_t)taba[i] * (dlimb_t)b + l + tabr[i]; + tabr[i] = t; + l = t >> LIMB_BITS; + } + return l; +} + +/* size of the result : op1_size + op2_size. */ +static void mp_mul_basecase(limb_t *result, + const limb_t *op1, limb_t op1_size, + const limb_t *op2, limb_t op2_size) +{ + limb_t i, r; + + result[op1_size] = mp_mul1(result, op1, op1_size, op2[0], 0); + for(i=1;i<op2_size;i++) { + r = mp_add_mul1(result + i, op1, op1_size, op2[i]); + result[i + op1_size] = r; + } +} + +/* return 0 if OK, -1 if memory error */ +/* XXX: change API so that result can be allocated */ +int mp_mul(bf_context_t *s, limb_t *result, + const limb_t *op1, limb_t op1_size, + const limb_t *op2, limb_t op2_size) +{ +#ifdef USE_FFT_MUL + if (unlikely(bf_min(op1_size, op2_size) >= FFT_MUL_THRESHOLD)) { + bf_t r_s, *r = &r_s; + r->tab = result; + /* XXX: optimize memory usage in API */ + if (fft_mul(s, r, (limb_t *)op1, op1_size, + (limb_t *)op2, op2_size, FFT_MUL_R_NORESIZE)) + return -1; + } else +#endif + { + mp_mul_basecase(result, op1, op1_size, op2, op2_size); + } + return 0; +} + +/* tabr[] -= taba[] * b. Return the value to substract to the high + word. */ +static limb_t mp_sub_mul1(limb_t *tabr, const limb_t *taba, limb_t n, + limb_t b) +{ + limb_t i, l; + dlimb_t t; + + l = 0; + for(i = 0; i < n; i++) { + t = tabr[i] - (dlimb_t)taba[i] * (dlimb_t)b - l; + tabr[i] = t; + l = -(t >> LIMB_BITS); + } + return l; +} + +/* WARNING: d must be >= 2^(LIMB_BITS-1) */ +static inline limb_t udiv1norm_init(limb_t d) +{ + limb_t a0, a1; + a1 = -d - 1; + a0 = -1; + return (((dlimb_t)a1 << LIMB_BITS) | a0) / d; +} + +/* return the quotient and the remainder in '*pr'of 'a1*2^LIMB_BITS+a0 + / d' with 0 <= a1 < d. */ +static inline limb_t udiv1norm(limb_t *pr, limb_t a1, limb_t a0, + limb_t d, limb_t d_inv) +{ + limb_t n1m, n_adj, q, r, ah; + dlimb_t a; + n1m = ((slimb_t)a0 >> (LIMB_BITS - 1)); + n_adj = a0 + (n1m & d); + a = (dlimb_t)d_inv * (a1 - n1m) + n_adj; + q = (a >> LIMB_BITS) + a1; + /* compute a - q * r and update q so that the remainder is\ + between 0 and d - 1 */ + a = ((dlimb_t)a1 << LIMB_BITS) | a0; + a = a - (dlimb_t)q * d - d; + ah = a >> LIMB_BITS; + q += 1 + ah; + r = (limb_t)a + (ah & d); + *pr = r; + return q; +} + +/* b must be >= 1 << (LIMB_BITS - 1) */ +static limb_t mp_div1norm(limb_t *tabr, const limb_t *taba, limb_t n, + limb_t b, limb_t r) +{ + slimb_t i; + + if (n >= UDIV1NORM_THRESHOLD) { + limb_t b_inv; + b_inv = udiv1norm_init(b); + for(i = n - 1; i >= 0; i--) { + tabr[i] = udiv1norm(&r, r, taba[i], b, b_inv); + } + } else { + dlimb_t a1; + for(i = n - 1; i >= 0; i--) { + a1 = ((dlimb_t)r << LIMB_BITS) | taba[i]; + tabr[i] = a1 / b; + r = a1 % b; + } + } + return r; +} + +static int mp_divnorm_large(bf_context_t *s, + limb_t *tabq, limb_t *taba, limb_t na, + const limb_t *tabb, limb_t nb); + +/* base case division: divides taba[0..na-1] by tabb[0..nb-1]. tabb[nb + - 1] must be >= 1 << (LIMB_BITS - 1). na - nb must be >= 0. 'taba' + is modified and contains the remainder (nb limbs). tabq[0..na-nb] + contains the quotient with tabq[na - nb] <= 1. */ +static int mp_divnorm(bf_context_t *s, limb_t *tabq, limb_t *taba, limb_t na, + const limb_t *tabb, limb_t nb) +{ + limb_t r, a, c, q, v, b1, b1_inv, n, dummy_r; + slimb_t i, j; + + b1 = tabb[nb - 1]; + if (nb == 1) { + taba[0] = mp_div1norm(tabq, taba, na, b1, 0); + return 0; + } + n = na - nb; + if (bf_min(n, nb) >= DIVNORM_LARGE_THRESHOLD) { + return mp_divnorm_large(s, tabq, taba, na, tabb, nb); + } + + if (n >= UDIV1NORM_THRESHOLD) + b1_inv = udiv1norm_init(b1); + else + b1_inv = 0; + + /* first iteration: the quotient is only 0 or 1 */ + q = 1; + for(j = nb - 1; j >= 0; j--) { + if (taba[n + j] != tabb[j]) { + if (taba[n + j] < tabb[j]) + q = 0; + break; + } + } + tabq[n] = q; + if (q) { + mp_sub(taba + n, taba + n, tabb, nb, 0); + } + + for(i = n - 1; i >= 0; i--) { + if (unlikely(taba[i + nb] >= b1)) { + q = -1; + } else if (b1_inv) { + q = udiv1norm(&dummy_r, taba[i + nb], taba[i + nb - 1], b1, b1_inv); + } else { + dlimb_t al; + al = ((dlimb_t)taba[i + nb] << LIMB_BITS) | taba[i + nb - 1]; + q = al / b1; + r = al % b1; + } + r = mp_sub_mul1(taba + i, tabb, nb, q); + + v = taba[i + nb]; + a = v - r; + c = (a > v); + taba[i + nb] = a; + + if (c != 0) { + /* negative result */ + for(;;) { + q--; + c = mp_add(taba + i, taba + i, tabb, nb, 0); + /* propagate carry and test if positive result */ + if (c != 0) { + if (++taba[i + nb] == 0) { + break; + } + } + } + } + tabq[i] = q; + } + return 0; +} + +/* compute r=B^(2*n)/a such as a*r < B^(2*n) < a*r + 2 with n >= 1. 'a' + has n limbs with a[n-1] >= B/2 and 'r' has n+1 limbs with r[n] = 1. + + See Modern Computer Arithmetic by Richard P. Brent and Paul + Zimmermann, algorithm 3.5 */ +int mp_recip(bf_context_t *s, limb_t *tabr, const limb_t *taba, limb_t n) +{ + mp_size_t l, h, k, i; + limb_t *tabxh, *tabt, c, *tabu; + + if (n <= 2) { + /* return ceil(B^(2*n)/a) - 1 */ + /* XXX: could avoid allocation */ + tabu = bf_malloc(s, sizeof(limb_t) * (2 * n + 1)); + tabt = bf_malloc(s, sizeof(limb_t) * (n + 2)); + if (!tabt || !tabu) + goto fail; + for(i = 0; i < 2 * n; i++) + tabu[i] = 0; + tabu[2 * n] = 1; + if (mp_divnorm(s, tabt, tabu, 2 * n + 1, taba, n)) + goto fail; + for(i = 0; i < n + 1; i++) + tabr[i] = tabt[i]; + if (mp_scan_nz(tabu, n) == 0) { + /* only happens for a=B^n/2 */ + mp_sub_ui(tabr, 1, n + 1); + } + } else { + l = (n - 1) / 2; + h = n - l; + /* n=2p -> l=p-1, h = p + 1, k = p + 3 + n=2p+1-> l=p, h = p + 1; k = p + 2 + */ + tabt = bf_malloc(s, sizeof(limb_t) * (n + h + 1)); + tabu = bf_malloc(s, sizeof(limb_t) * (n + 2 * h - l + 2)); + if (!tabt || !tabu) + goto fail; + tabxh = tabr + l; + if (mp_recip(s, tabxh, taba + l, h)) + goto fail; + if (mp_mul(s, tabt, taba, n, tabxh, h + 1)) /* n + h + 1 limbs */ + goto fail; + while (tabt[n + h] != 0) { + mp_sub_ui(tabxh, 1, h + 1); + c = mp_sub(tabt, tabt, taba, n, 0); + mp_sub_ui(tabt + n, c, h + 1); + } + /* T = B^(n+h) - T */ + mp_neg(tabt, tabt, n + h + 1, 0); + tabt[n + h]++; + if (mp_mul(s, tabu, tabt + l, n + h + 1 - l, tabxh, h + 1)) + goto fail; + /* n + 2*h - l + 2 limbs */ + k = 2 * h - l; + for(i = 0; i < l; i++) + tabr[i] = tabu[i + k]; + mp_add(tabr + l, tabr + l, tabu + 2 * h, h, 0); + } + bf_free(s, tabt); + bf_free(s, tabu); + return 0; + fail: + bf_free(s, tabt); + bf_free(s, tabu); + return -1; +} + +/* return -1, 0 or 1 */ +static int mp_cmp(const limb_t *taba, const limb_t *tabb, mp_size_t n) +{ + mp_size_t i; + for(i = n - 1; i >= 0; i--) { + if (taba[i] != tabb[i]) { + if (taba[i] < tabb[i]) + return -1; + else + return 1; + } + } + return 0; +} + +//#define DEBUG_DIVNORM_LARGE +//#define DEBUG_DIVNORM_LARGE2 + +/* subquadratic divnorm */ +static int mp_divnorm_large(bf_context_t *s, + limb_t *tabq, limb_t *taba, limb_t na, + const limb_t *tabb, limb_t nb) +{ + limb_t *tabb_inv, nq, *tabt, i, n; + nq = na - nb; +#ifdef DEBUG_DIVNORM_LARGE + printf("na=%d nb=%d nq=%d\n", (int)na, (int)nb, (int)nq); + mp_print_str("a", taba, na); + mp_print_str("b", tabb, nb); +#endif + assert(nq >= 1); + n = nq; + if (nq < nb) + n++; + tabb_inv = bf_malloc(s, sizeof(limb_t) * (n + 1)); + tabt = bf_malloc(s, sizeof(limb_t) * 2 * (n + 1)); + if (!tabb_inv || !tabt) + goto fail; + + if (n >= nb) { + for(i = 0; i < n - nb; i++) + tabt[i] = 0; + for(i = 0; i < nb; i++) + tabt[i + n - nb] = tabb[i]; + } else { + /* truncate B: need to increment it so that the approximate + inverse is smaller that the exact inverse */ + for(i = 0; i < n; i++) + tabt[i] = tabb[i + nb - n]; + if (mp_add_ui(tabt, 1, n)) { + /* tabt = B^n : tabb_inv = B^n */ + memset(tabb_inv, 0, n * sizeof(limb_t)); + tabb_inv[n] = 1; + goto recip_done; + } + } + if (mp_recip(s, tabb_inv, tabt, n)) + goto fail; + recip_done: + /* Q=A*B^-1 */ + if (mp_mul(s, tabt, tabb_inv, n + 1, taba + na - (n + 1), n + 1)) + goto fail; + + for(i = 0; i < nq + 1; i++) + tabq[i] = tabt[i + 2 * (n + 1) - (nq + 1)]; +#ifdef DEBUG_DIVNORM_LARGE + mp_print_str("q", tabq, nq + 1); +#endif + + bf_free(s, tabt); + bf_free(s, tabb_inv); + tabb_inv = NULL; + + /* R=A-B*Q */ + tabt = bf_malloc(s, sizeof(limb_t) * (na + 1)); + if (!tabt) + goto fail; + if (mp_mul(s, tabt, tabq, nq + 1, tabb, nb)) + goto fail; + /* we add one more limb for the result */ + mp_sub(taba, taba, tabt, nb + 1, 0); + bf_free(s, tabt); + /* the approximated quotient is smaller than than the exact one, + hence we may have to increment it */ +#ifdef DEBUG_DIVNORM_LARGE2 + int cnt = 0; + static int cnt_max; +#endif + for(;;) { + if (taba[nb] == 0 && mp_cmp(taba, tabb, nb) < 0) + break; + taba[nb] -= mp_sub(taba, taba, tabb, nb, 0); + mp_add_ui(tabq, 1, nq + 1); +#ifdef DEBUG_DIVNORM_LARGE2 + cnt++; +#endif + } +#ifdef DEBUG_DIVNORM_LARGE2 + if (cnt > cnt_max) { + cnt_max = cnt; + printf("\ncnt=%d nq=%d nb=%d\n", cnt_max, (int)nq, (int)nb); + } +#endif + return 0; + fail: + bf_free(s, tabb_inv); + bf_free(s, tabt); + return -1; +} + +int bf_mul(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags) +{ + int ret, r_sign; + + if (a->len < b->len) { + const bf_t *tmp = a; + a = b; + b = tmp; + } + r_sign = a->sign ^ b->sign; + /* here b->len <= a->len */ + if (b->len == 0) { + if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { + bf_set_nan(r); + ret = 0; + } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_INF) { + if ((a->expn == BF_EXP_INF && b->expn == BF_EXP_ZERO) || + (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_INF)) { + bf_set_nan(r); + ret = BF_ST_INVALID_OP; + } else { + bf_set_inf(r, r_sign); + ret = 0; + } + } else { + bf_set_zero(r, r_sign); + ret = 0; + } + } else { + bf_t tmp, *r1 = NULL; + limb_t a_len, b_len, precl; + limb_t *a_tab, *b_tab; + + a_len = a->len; + b_len = b->len; + + if ((flags & BF_RND_MASK) == BF_RNDF) { + /* faithful rounding does not require using the full inputs */ + precl = (prec + 2 + LIMB_BITS - 1) / LIMB_BITS; + a_len = bf_min(a_len, precl); + b_len = bf_min(b_len, precl); + } + a_tab = a->tab + a->len - a_len; + b_tab = b->tab + b->len - b_len; + +#ifdef USE_FFT_MUL + if (b_len >= FFT_MUL_THRESHOLD) { + int mul_flags = 0; + if (r == a) + mul_flags |= FFT_MUL_R_OVERLAP_A; + if (r == b) + mul_flags |= FFT_MUL_R_OVERLAP_B; + if (fft_mul(r->ctx, r, a_tab, a_len, b_tab, b_len, mul_flags)) + goto fail; + } else +#endif + { + if (r == a || r == b) { + bf_init(r->ctx, &tmp); + r1 = r; + r = &tmp; + } + if (bf_resize(r, a_len + b_len)) { + fail: + bf_set_nan(r); + ret = BF_ST_MEM_ERROR; + goto done; + } + mp_mul_basecase(r->tab, a_tab, a_len, b_tab, b_len); + } + r->sign = r_sign; + r->expn = a->expn + b->expn; + ret = bf_normalize_and_round(r, prec, flags); + done: + if (r == &tmp) + bf_move(r1, &tmp); + } + return ret; +} + +/* multiply 'r' by 2^e */ +int bf_mul_2exp(bf_t *r, slimb_t e, limb_t prec, bf_flags_t flags) +{ + slimb_t e_max; + if (r->len == 0) + return 0; + e_max = ((limb_t)1 << BF_EXT_EXP_BITS_MAX) - 1; + e = bf_max(e, -e_max); + e = bf_min(e, e_max); + r->expn += e; + return __bf_round(r, prec, flags, r->len, 0); +} + +/* Return e such as a=m*2^e with m odd integer. return 0 if a is zero, + Infinite or Nan. */ +slimb_t bf_get_exp_min(const bf_t *a) +{ + slimb_t i; + limb_t v; + int k; + + for(i = 0; i < a->len; i++) { + v = a->tab[i]; + if (v != 0) { + k = ctz(v); + return a->expn - (a->len - i) * LIMB_BITS + k; + } + } + return 0; +} + +/* a and b must be finite numbers with a >= 0 and b > 0. 'q' is the + integer defined as floor(a/b) and r = a - q * b. */ +static void bf_tdivremu(bf_t *q, bf_t *r, + const bf_t *a, const bf_t *b) +{ + if (bf_cmpu(a, b) < 0) { + bf_set_ui(q, 0); + bf_set(r, a); + } else { + bf_div(q, a, b, bf_max(a->expn - b->expn + 1, 2), BF_RNDZ); + bf_rint(q, BF_RNDZ); + bf_mul(r, q, b, BF_PREC_INF, BF_RNDZ); + bf_sub(r, a, r, BF_PREC_INF, BF_RNDZ); + } +} + +static int __bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags) +{ + bf_context_t *s = r->ctx; + int ret, r_sign; + limb_t n, nb, precl; + + r_sign = a->sign ^ b->sign; + if (a->expn >= BF_EXP_INF || b->expn >= BF_EXP_INF) { + if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { + bf_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF && b->expn == BF_EXP_INF) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } else if (a->expn == BF_EXP_INF) { + bf_set_inf(r, r_sign); + return 0; + } else { + bf_set_zero(r, r_sign); + return 0; + } + } else if (a->expn == BF_EXP_ZERO) { + if (b->expn == BF_EXP_ZERO) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } else { + bf_set_zero(r, r_sign); + return 0; + } + } else if (b->expn == BF_EXP_ZERO) { + bf_set_inf(r, r_sign); + return BF_ST_DIVIDE_ZERO; + } + + /* number of limbs of the quotient (2 extra bits for rounding) */ + precl = (prec + 2 + LIMB_BITS - 1) / LIMB_BITS; + nb = b->len; + n = bf_max(a->len, precl); + + { + limb_t *taba, na; + slimb_t d; + + na = n + nb; + taba = bf_malloc(s, (na + 1) * sizeof(limb_t)); + if (!taba) + goto fail; + d = na - a->len; + memset(taba, 0, d * sizeof(limb_t)); + memcpy(taba + d, a->tab, a->len * sizeof(limb_t)); + if (bf_resize(r, n + 1)) + goto fail1; + if (mp_divnorm(s, r->tab, taba, na, b->tab, nb)) { + fail1: + bf_free(s, taba); + goto fail; + } + /* see if non zero remainder */ + if (mp_scan_nz(taba, nb)) + r->tab[0] |= 1; + bf_free(r->ctx, taba); + r->expn = a->expn - b->expn + LIMB_BITS; + r->sign = r_sign; + ret = bf_normalize_and_round(r, prec, flags); + } + return ret; + fail: + bf_set_nan(r); + return BF_ST_MEM_ERROR; +} + +/* division and remainder. + + rnd_mode is the rounding mode for the quotient. The additional + rounding mode BF_RND_EUCLIDIAN is supported. + + 'q' is an integer. 'r' is rounded with prec and flags (prec can be + BF_PREC_INF). +*/ +int bf_divrem(bf_t *q, bf_t *r, const bf_t *a, const bf_t *b, + limb_t prec, bf_flags_t flags, int rnd_mode) +{ + bf_t a1_s, *a1 = &a1_s; + bf_t b1_s, *b1 = &b1_s; + int q_sign, ret; + BOOL is_ceil, is_rndn; + + assert(q != a && q != b); + assert(r != a && r != b); + assert(q != r); + + if (a->len == 0 || b->len == 0) { + bf_set_zero(q, 0); + if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { + bf_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_ZERO) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } else { + bf_set(r, a); + return bf_round(r, prec, flags); + } + } + + q_sign = a->sign ^ b->sign; + is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA); + switch(rnd_mode) { + default: + case BF_RNDZ: + case BF_RNDN: + case BF_RNDNA: + is_ceil = FALSE; + break; + case BF_RNDD: + is_ceil = q_sign; + break; + case BF_RNDU: + is_ceil = q_sign ^ 1; + break; + case BF_RNDA: + is_ceil = TRUE; + break; + case BF_DIVREM_EUCLIDIAN: + is_ceil = a->sign; + break; + } + + a1->expn = a->expn; + a1->tab = a->tab; + a1->len = a->len; + a1->sign = 0; + + b1->expn = b->expn; + b1->tab = b->tab; + b1->len = b->len; + b1->sign = 0; + + /* XXX: could improve to avoid having a large 'q' */ + bf_tdivremu(q, r, a1, b1); + if (bf_is_nan(q) || bf_is_nan(r)) + goto fail; + + if (r->len != 0) { + if (is_rndn) { + int res; + b1->expn--; + res = bf_cmpu(r, b1); + b1->expn++; + if (res > 0 || + (res == 0 && + (rnd_mode == BF_RNDNA || + get_bit(q->tab, q->len, q->len * LIMB_BITS - q->expn)))) { + goto do_sub_r; + } + } else if (is_ceil) { + do_sub_r: + ret = bf_add_si(q, q, 1, BF_PREC_INF, BF_RNDZ); + ret |= bf_sub(r, r, b1, BF_PREC_INF, BF_RNDZ); + if (ret & BF_ST_MEM_ERROR) + goto fail; + } + } + + r->sign ^= a->sign; + q->sign = q_sign; + return bf_round(r, prec, flags); + fail: + bf_set_nan(q); + bf_set_nan(r); + return BF_ST_MEM_ERROR; +} + +int bf_rem(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags, int rnd_mode) +{ + bf_t q_s, *q = &q_s; + int ret; + + bf_init(r->ctx, q); + ret = bf_divrem(q, r, a, b, prec, flags, rnd_mode); + bf_delete(q); + return ret; +} + +static inline int bf_get_limb(slimb_t *pres, const bf_t *a, int flags) +{ +#if LIMB_BITS == 32 + return bf_get_int32(pres, a, flags); +#else + return bf_get_int64(pres, a, flags); +#endif +} + +int bf_remquo(slimb_t *pq, bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags, int rnd_mode) +{ + bf_t q_s, *q = &q_s; + int ret; + + bf_init(r->ctx, q); + ret = bf_divrem(q, r, a, b, prec, flags, rnd_mode); + bf_get_limb(pq, q, BF_GET_INT_MOD); + bf_delete(q); + return ret; +} + +static __maybe_unused inline limb_t mul_mod(limb_t a, limb_t b, limb_t m) +{ + dlimb_t t; + t = (dlimb_t)a * (dlimb_t)b; + return t % m; +} + +#if defined(USE_MUL_CHECK) +static limb_t mp_mod1(const limb_t *tab, limb_t n, limb_t m, limb_t r) +{ + slimb_t i; + dlimb_t t; + + for(i = n - 1; i >= 0; i--) { + t = ((dlimb_t)r << LIMB_BITS) | tab[i]; + r = t % m; + } + return r; +} +#endif + +static const uint16_t sqrt_table[192] = { +128,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,144,145,146,147,148,149,150,150,151,152,153,154,155,155,156,157,158,159,160,160,161,162,163,163,164,165,166,167,167,168,169,170,170,171,172,173,173,174,175,176,176,177,178,178,179,180,181,181,182,183,183,184,185,185,186,187,187,188,189,189,190,191,192,192,193,193,194,195,195,196,197,197,198,199,199,200,201,201,202,203,203,204,204,205,206,206,207,208,208,209,209,210,211,211,212,212,213,214,214,215,215,216,217,217,218,218,219,219,220,221,221,222,222,223,224,224,225,225,226,226,227,227,228,229,229,230,230,231,231,232,232,233,234,234,235,235,236,236,237,237,238,238,239,240,240,241,241,242,242,243,243,244,244,245,245,246,246,247,247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255, +}; + +/* a >= 2^(LIMB_BITS - 2). Return (s, r) with s=floor(sqrt(a)) and + r=a-s^2. 0 <= r <= 2 * s */ +static limb_t mp_sqrtrem1(limb_t *pr, limb_t a) +{ + limb_t s1, r1, s, r, q, u, num; + + /* use a table for the 16 -> 8 bit sqrt */ + s1 = sqrt_table[(a >> (LIMB_BITS - 8)) - 64]; + r1 = (a >> (LIMB_BITS - 16)) - s1 * s1; + if (r1 > 2 * s1) { + r1 -= 2 * s1 + 1; + s1++; + } + + /* one iteration to get a 32 -> 16 bit sqrt */ + num = (r1 << 8) | ((a >> (LIMB_BITS - 32 + 8)) & 0xff); + q = num / (2 * s1); /* q <= 2^8 */ + u = num % (2 * s1); + s = (s1 << 8) + q; + r = (u << 8) | ((a >> (LIMB_BITS - 32)) & 0xff); + r -= q * q; + if ((slimb_t)r < 0) { + s--; + r += 2 * s + 1; + } + +#if LIMB_BITS == 64 + s1 = s; + r1 = r; + /* one more iteration for 64 -> 32 bit sqrt */ + num = (r1 << 16) | ((a >> (LIMB_BITS - 64 + 16)) & 0xffff); + q = num / (2 * s1); /* q <= 2^16 */ + u = num % (2 * s1); + s = (s1 << 16) + q; + r = (u << 16) | ((a >> (LIMB_BITS - 64)) & 0xffff); + r -= q * q; + if ((slimb_t)r < 0) { + s--; + r += 2 * s + 1; + } +#endif + *pr = r; + return s; +} + +/* return floor(sqrt(a)) */ +limb_t bf_isqrt(limb_t a) +{ + limb_t s, r; + int k; + + if (a == 0) + return 0; + k = clz(a) & ~1; + s = mp_sqrtrem1(&r, a << k); + s >>= (k >> 1); + return s; +} + +static limb_t mp_sqrtrem2(limb_t *tabs, limb_t *taba) +{ + limb_t s1, r1, s, q, u, a0, a1; + dlimb_t r, num; + int l; + + a0 = taba[0]; + a1 = taba[1]; + s1 = mp_sqrtrem1(&r1, a1); + l = LIMB_BITS / 2; + num = ((dlimb_t)r1 << l) | (a0 >> l); + q = num / (2 * s1); + u = num % (2 * s1); + s = (s1 << l) + q; + r = ((dlimb_t)u << l) | (a0 & (((limb_t)1 << l) - 1)); + if (unlikely((q >> l) != 0)) + r -= (dlimb_t)1 << LIMB_BITS; /* special case when q=2^l */ + else + r -= q * q; + if ((slimb_t)(r >> LIMB_BITS) < 0) { + s--; + r += 2 * (dlimb_t)s + 1; + } + tabs[0] = s; + taba[0] = r; + return r >> LIMB_BITS; +} + +//#define DEBUG_SQRTREM + +/* tmp_buf must contain (n / 2 + 1 limbs). *prh contains the highest + limb of the remainder. */ +static int mp_sqrtrem_rec(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n, + limb_t *tmp_buf, limb_t *prh) +{ + limb_t l, h, rh, ql, qh, c, i; + + if (n == 1) { + *prh = mp_sqrtrem2(tabs, taba); + return 0; + } +#ifdef DEBUG_SQRTREM + mp_print_str("a", taba, 2 * n); +#endif + l = n / 2; + h = n - l; + if (mp_sqrtrem_rec(s, tabs + l, taba + 2 * l, h, tmp_buf, &qh)) + return -1; +#ifdef DEBUG_SQRTREM + mp_print_str("s1", tabs + l, h); + mp_print_str_h("r1", taba + 2 * l, h, qh); + mp_print_str_h("r2", taba + l, n, qh); +#endif + + /* the remainder is in taba + 2 * l. Its high bit is in qh */ + if (qh) { + mp_sub(taba + 2 * l, taba + 2 * l, tabs + l, h, 0); + } + /* instead of dividing by 2*s, divide by s (which is normalized) + and update q and r */ + if (mp_divnorm(s, tmp_buf, taba + l, n, tabs + l, h)) + return -1; + qh += tmp_buf[l]; + for(i = 0; i < l; i++) + tabs[i] = tmp_buf[i]; + ql = mp_shr(tabs, tabs, l, 1, qh & 1); + qh = qh >> 1; /* 0 or 1 */ + if (ql) + rh = mp_add(taba + l, taba + l, tabs + l, h, 0); + else + rh = 0; +#ifdef DEBUG_SQRTREM + mp_print_str_h("q", tabs, l, qh); + mp_print_str_h("u", taba + l, h, rh); +#endif + + mp_add_ui(tabs + l, qh, h); +#ifdef DEBUG_SQRTREM + mp_print_str_h("s2", tabs, n, sh); +#endif + + /* q = qh, tabs[l - 1 ... 0], r = taba[n - 1 ... l] */ + /* subtract q^2. if qh = 1 then q = B^l, so we can take shortcuts */ + if (qh) { + c = qh; + } else { + if (mp_mul(s, taba + n, tabs, l, tabs, l)) + return -1; + c = mp_sub(taba, taba, taba + n, 2 * l, 0); + } + rh -= mp_sub_ui(taba + 2 * l, c, n - 2 * l); + if ((slimb_t)rh < 0) { + mp_sub_ui(tabs, 1, n); + rh += mp_add_mul1(taba, tabs, n, 2); + rh += mp_add_ui(taba, 1, n); + } + *prh = rh; + return 0; +} + +/* 'taba' has 2*n limbs with n >= 1 and taba[2*n-1] >= 2 ^ (LIMB_BITS + - 2). Return (s, r) with s=floor(sqrt(a)) and r=a-s^2. 0 <= r <= 2 + * s. tabs has n limbs. r is returned in the lower n limbs of + taba. Its r[n] is the returned value of the function. */ +/* Algorithm from the article "Karatsuba Square Root" by Paul Zimmermann and + inspirated from its GMP implementation */ +int mp_sqrtrem(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n) +{ + limb_t tmp_buf1[8]; + limb_t *tmp_buf; + mp_size_t n2; + int ret; + n2 = n / 2 + 1; + if (n2 <= countof(tmp_buf1)) { + tmp_buf = tmp_buf1; + } else { + tmp_buf = bf_malloc(s, sizeof(limb_t) * n2); + if (!tmp_buf) + return -1; + } + ret = mp_sqrtrem_rec(s, tabs, taba, n, tmp_buf, taba + n); + if (tmp_buf != tmp_buf1) + bf_free(s, tmp_buf); + return ret; +} + +/* Integer square root with remainder. 'a' must be an integer. r = + floor(sqrt(a)) and rem = a - r^2. BF_ST_INEXACT is set if the result + is inexact. 'rem' can be NULL if the remainder is not needed. */ +int bf_sqrtrem(bf_t *r, bf_t *rem1, const bf_t *a) +{ + int ret; + + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bf_set_nan(r); + } else if (a->expn == BF_EXP_INF && a->sign) { + goto invalid_op; + } else { + bf_set(r, a); + } + if (rem1) + bf_set_ui(rem1, 0); + ret = 0; + } else if (a->sign) { + invalid_op: + bf_set_nan(r); + if (rem1) + bf_set_ui(rem1, 0); + ret = BF_ST_INVALID_OP; + } else { + bf_t rem_s, *rem; + + bf_sqrt(r, a, (a->expn + 1) / 2, BF_RNDZ); + bf_rint(r, BF_RNDZ); + /* see if the result is exact by computing the remainder */ + if (rem1) { + rem = rem1; + } else { + rem = &rem_s; + bf_init(r->ctx, rem); + } + /* XXX: could avoid recomputing the remainder */ + bf_mul(rem, r, r, BF_PREC_INF, BF_RNDZ); + bf_neg(rem); + bf_add(rem, rem, a, BF_PREC_INF, BF_RNDZ); + if (bf_is_nan(rem)) { + ret = BF_ST_MEM_ERROR; + goto done; + } + if (rem->len != 0) { + ret = BF_ST_INEXACT; + } else { + ret = 0; + } + done: + if (!rem1) + bf_delete(rem); + } + return ret; +} + +int bf_sqrt(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) +{ + bf_context_t *s = a->ctx; + int ret; + + assert(r != a); + + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bf_set_nan(r); + } else if (a->expn == BF_EXP_INF && a->sign) { + goto invalid_op; + } else { + bf_set(r, a); + } + ret = 0; + } else if (a->sign) { + invalid_op: + bf_set_nan(r); + ret = BF_ST_INVALID_OP; + } else { + limb_t *a1; + slimb_t n, n1; + limb_t res; + + /* convert the mantissa to an integer with at least 2 * + prec + 4 bits */ + n = (2 * (prec + 2) + 2 * LIMB_BITS - 1) / (2 * LIMB_BITS); + if (bf_resize(r, n)) + goto fail; + a1 = bf_malloc(s, sizeof(limb_t) * 2 * n); + if (!a1) + goto fail; + n1 = bf_min(2 * n, a->len); + memset(a1, 0, (2 * n - n1) * sizeof(limb_t)); + memcpy(a1 + 2 * n - n1, a->tab + a->len - n1, n1 * sizeof(limb_t)); + if (a->expn & 1) { + res = mp_shr(a1, a1, 2 * n, 1, 0); + } else { + res = 0; + } + if (mp_sqrtrem(s, r->tab, a1, n)) { + bf_free(s, a1); + goto fail; + } + if (!res) { + res = mp_scan_nz(a1, n + 1); + } + bf_free(s, a1); + if (!res) { + res = mp_scan_nz(a->tab, a->len - n1); + } + if (res != 0) + r->tab[0] |= 1; + r->sign = 0; + r->expn = (a->expn + 1) >> 1; + ret = bf_round(r, prec, flags); + } + return ret; + fail: + bf_set_nan(r); + return BF_ST_MEM_ERROR; +} + +static no_inline int bf_op2(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags, bf_op2_func_t *func) +{ + bf_t tmp; + int ret; + + if (r == a || r == b) { + bf_init(r->ctx, &tmp); + ret = func(&tmp, a, b, prec, flags); + bf_move(r, &tmp); + } else { + ret = func(r, a, b, prec, flags); + } + return ret; +} + +int bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags) +{ + return bf_op2(r, a, b, prec, flags, __bf_add); +} + +int bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags) +{ + return bf_op2(r, a, b, prec, flags, __bf_sub); +} + +int bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, + bf_flags_t flags) +{ + return bf_op2(r, a, b, prec, flags, __bf_div); +} + +int bf_mul_ui(bf_t *r, const bf_t *a, uint64_t b1, limb_t prec, + bf_flags_t flags) +{ + bf_t b; + int ret; + bf_init(r->ctx, &b); + ret = bf_set_ui(&b, b1); + ret |= bf_mul(r, a, &b, prec, flags); + bf_delete(&b); + return ret; +} + +int bf_mul_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec, + bf_flags_t flags) +{ + bf_t b; + int ret; + bf_init(r->ctx, &b); + ret = bf_set_si(&b, b1); + ret |= bf_mul(r, a, &b, prec, flags); + bf_delete(&b); + return ret; +} + +int bf_add_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec, + bf_flags_t flags) +{ + bf_t b; + int ret; + + bf_init(r->ctx, &b); + ret = bf_set_si(&b, b1); + ret |= bf_add(r, a, &b, prec, flags); + bf_delete(&b); + return ret; +} + +static int bf_pow_ui(bf_t *r, const bf_t *a, limb_t b, limb_t prec, + bf_flags_t flags) +{ + int ret, n_bits, i; + + assert(r != a); + if (b == 0) + return bf_set_ui(r, 1); + ret = bf_set(r, a); + n_bits = LIMB_BITS - clz(b); + for(i = n_bits - 2; i >= 0; i--) { + ret |= bf_mul(r, r, r, prec, flags); + if ((b >> i) & 1) + ret |= bf_mul(r, r, a, prec, flags); + } + return ret; +} + +static int bf_pow_ui_ui(bf_t *r, limb_t a1, limb_t b, + limb_t prec, bf_flags_t flags) +{ + bf_t a; + int ret; + + if (a1 == 10 && b <= LIMB_DIGITS) { + /* use precomputed powers. We do not round at this point + because we expect the caller to do it */ + ret = bf_set_ui(r, mp_pow_dec[b]); + } else { + bf_init(r->ctx, &a); + ret = bf_set_ui(&a, a1); + ret |= bf_pow_ui(r, &a, b, prec, flags); + bf_delete(&a); + } + return ret; +} + +/* convert to integer (infinite precision) */ +int bf_rint(bf_t *r, int rnd_mode) +{ + return bf_round(r, 0, rnd_mode | BF_FLAG_RADPNT_PREC); +} + +/* logical operations */ +#define BF_LOGIC_OR 0 +#define BF_LOGIC_XOR 1 +#define BF_LOGIC_AND 2 + +static inline limb_t bf_logic_op1(limb_t a, limb_t b, int op) +{ + switch(op) { + case BF_LOGIC_OR: + return a | b; + case BF_LOGIC_XOR: + return a ^ b; + default: + case BF_LOGIC_AND: + return a & b; + } +} + +static int bf_logic_op(bf_t *r, const bf_t *a1, const bf_t *b1, int op) +{ + bf_t b1_s, a1_s, *a, *b; + limb_t a_sign, b_sign, r_sign; + slimb_t l, i, a_bit_offset, b_bit_offset; + limb_t v1, v2, v1_mask, v2_mask, r_mask; + int ret; + + assert(r != a1 && r != b1); + + if (a1->expn <= 0) + a_sign = 0; /* minus zero is considered as positive */ + else + a_sign = a1->sign; + + if (b1->expn <= 0) + b_sign = 0; /* minus zero is considered as positive */ + else + b_sign = b1->sign; + + if (a_sign) { + a = &a1_s; + bf_init(r->ctx, a); + if (bf_add_si(a, a1, 1, BF_PREC_INF, BF_RNDZ)) { + b = NULL; + goto fail; + } + } else { + a = (bf_t *)a1; + } + + if (b_sign) { + b = &b1_s; + bf_init(r->ctx, b); + if (bf_add_si(b, b1, 1, BF_PREC_INF, BF_RNDZ)) + goto fail; + } else { + b = (bf_t *)b1; + } + + r_sign = bf_logic_op1(a_sign, b_sign, op); + if (op == BF_LOGIC_AND && r_sign == 0) { + /* no need to compute extra zeros for and */ + if (a_sign == 0 && b_sign == 0) + l = bf_min(a->expn, b->expn); + else if (a_sign == 0) + l = a->expn; + else + l = b->expn; + } else { + l = bf_max(a->expn, b->expn); + } + /* Note: a or b can be zero */ + l = (bf_max(l, 1) + LIMB_BITS - 1) / LIMB_BITS; + if (bf_resize(r, l)) + goto fail; + a_bit_offset = a->len * LIMB_BITS - a->expn; + b_bit_offset = b->len * LIMB_BITS - b->expn; + v1_mask = -a_sign; + v2_mask = -b_sign; + r_mask = -r_sign; + for(i = 0; i < l; i++) { + v1 = get_bits(a->tab, a->len, a_bit_offset + i * LIMB_BITS) ^ v1_mask; + v2 = get_bits(b->tab, b->len, b_bit_offset + i * LIMB_BITS) ^ v2_mask; + r->tab[i] = bf_logic_op1(v1, v2, op) ^ r_mask; + } + r->expn = l * LIMB_BITS; + r->sign = r_sign; + bf_normalize_and_round(r, BF_PREC_INF, BF_RNDZ); /* cannot fail */ + if (r_sign) { + if (bf_add_si(r, r, -1, BF_PREC_INF, BF_RNDZ)) + goto fail; + } + ret = 0; + done: + if (a == &a1_s) + bf_delete(a); + if (b == &b1_s) + bf_delete(b); + return ret; + fail: + bf_set_nan(r); + ret = BF_ST_MEM_ERROR; + goto done; +} + +/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */ +int bf_logic_or(bf_t *r, const bf_t *a, const bf_t *b) +{ + return bf_logic_op(r, a, b, BF_LOGIC_OR); +} + +/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */ +int bf_logic_xor(bf_t *r, const bf_t *a, const bf_t *b) +{ + return bf_logic_op(r, a, b, BF_LOGIC_XOR); +} + +/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */ +int bf_logic_and(bf_t *r, const bf_t *a, const bf_t *b) +{ + return bf_logic_op(r, a, b, BF_LOGIC_AND); +} + +/* conversion between fixed size types */ + +typedef union { + double d; + uint64_t u; +} Float64Union; + +int bf_get_float64(const bf_t *a, double *pres, bf_rnd_t rnd_mode) +{ + Float64Union u; + int e, ret; + uint64_t m; + + ret = 0; + if (a->expn == BF_EXP_NAN) { + u.u = 0x7ff8000000000000; /* quiet nan */ + } else { + bf_t b_s, *b = &b_s; + + bf_init(a->ctx, b); + bf_set(b, a); + if (bf_is_finite(b)) { + ret = bf_round(b, 53, rnd_mode | BF_FLAG_SUBNORMAL | bf_set_exp_bits(11)); + } + if (b->expn == BF_EXP_INF) { + e = (1 << 11) - 1; + m = 0; + } else if (b->expn == BF_EXP_ZERO) { + e = 0; + m = 0; + } else { + e = b->expn + 1023 - 1; +#if LIMB_BITS == 32 + if (b->len == 2) { + m = ((uint64_t)b->tab[1] << 32) | b->tab[0]; + } else { + m = ((uint64_t)b->tab[0] << 32); + } +#else + m = b->tab[0]; +#endif + if (e <= 0) { + /* subnormal */ + m = m >> (12 - e); + e = 0; + } else { + m = (m << 1) >> 12; + } + } + u.u = m | ((uint64_t)e << 52) | ((uint64_t)b->sign << 63); + bf_delete(b); + } + *pres = u.d; + return ret; +} + +int bf_set_float64(bf_t *a, double d) +{ + Float64Union u; + uint64_t m; + int shift, e, sgn; + + u.d = d; + sgn = u.u >> 63; + e = (u.u >> 52) & ((1 << 11) - 1); + m = u.u & (((uint64_t)1 << 52) - 1); + if (e == ((1 << 11) - 1)) { + if (m != 0) { + bf_set_nan(a); + } else { + bf_set_inf(a, sgn); + } + } else if (e == 0) { + if (m == 0) { + bf_set_zero(a, sgn); + } else { + /* subnormal number */ + m <<= 12; + shift = clz64(m); + m <<= shift; + e = -shift; + goto norm; + } + } else { + m = (m << 11) | ((uint64_t)1 << 63); + norm: + a->expn = e - 1023 + 1; +#if LIMB_BITS == 32 + if (bf_resize(a, 2)) + goto fail; + a->tab[0] = m; + a->tab[1] = m >> 32; +#else + if (bf_resize(a, 1)) + goto fail; + a->tab[0] = m; +#endif + a->sign = sgn; + } + return 0; +fail: + bf_set_nan(a); + return BF_ST_MEM_ERROR; +} + +/* The rounding mode is always BF_RNDZ. Return BF_ST_INVALID_OP if there + is an overflow and 0 otherwise. */ +int bf_get_int32(int *pres, const bf_t *a, int flags) +{ + uint32_t v; + int ret; + if (a->expn >= BF_EXP_INF) { + ret = BF_ST_INVALID_OP; + if (flags & BF_GET_INT_MOD) { + v = 0; + } else if (a->expn == BF_EXP_INF) { + v = (uint32_t)INT32_MAX + a->sign; + } else { + v = INT32_MAX; + } + } else if (a->expn <= 0) { + v = 0; + ret = 0; + } else if (a->expn <= 31) { + v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn); + if (a->sign) + v = -v; + ret = 0; + } else if (!(flags & BF_GET_INT_MOD)) { + ret = BF_ST_INVALID_OP; + if (a->sign) { + v = (uint32_t)INT32_MAX + 1; + if (a->expn == 32 && + (a->tab[a->len - 1] >> (LIMB_BITS - 32)) == v) { + ret = 0; + } + } else { + v = INT32_MAX; + } + } else { + v = get_bits(a->tab, a->len, a->len * LIMB_BITS - a->expn); + if (a->sign) + v = -v; + ret = 0; + } + *pres = v; + return ret; +} + +/* The rounding mode is always BF_RNDZ. Return BF_ST_INVALID_OP if there + is an overflow and 0 otherwise. */ +int bf_get_int64(int64_t *pres, const bf_t *a, int flags) +{ + uint64_t v; + int ret; + if (a->expn >= BF_EXP_INF) { + ret = BF_ST_INVALID_OP; + if (flags & BF_GET_INT_MOD) { + v = 0; + } else if (a->expn == BF_EXP_INF) { + v = (uint64_t)INT64_MAX + a->sign; + } else { + v = INT64_MAX; + } + } else if (a->expn <= 0) { + v = 0; + ret = 0; + } else if (a->expn <= 63) { +#if LIMB_BITS == 32 + if (a->expn <= 32) + v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn); + else + v = (((uint64_t)a->tab[a->len - 1] << 32) | + get_limbz(a, a->len - 2)) >> (64 - a->expn); +#else + v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn); +#endif + if (a->sign) + v = -v; + ret = 0; + } else if (!(flags & BF_GET_INT_MOD)) { + ret = BF_ST_INVALID_OP; + if (a->sign) { + uint64_t v1; + v = (uint64_t)INT64_MAX + 1; + if (a->expn == 64) { + v1 = a->tab[a->len - 1]; +#if LIMB_BITS == 32 + v1 = (v1 << 32) | get_limbz(a, a->len - 2); +#endif + if (v1 == v) + ret = 0; + } + } else { + v = INT64_MAX; + } + } else { + slimb_t bit_pos = a->len * LIMB_BITS - a->expn; + v = get_bits(a->tab, a->len, bit_pos); +#if LIMB_BITS == 32 + v |= (uint64_t)get_bits(a->tab, a->len, bit_pos + 32) << 32; +#endif + if (a->sign) + v = -v; + ret = 0; + } + *pres = v; + return ret; +} + +/* The rounding mode is always BF_RNDZ. Return BF_ST_INVALID_OP if there + is an overflow and 0 otherwise. */ +int bf_get_uint64(uint64_t *pres, const bf_t *a) +{ + uint64_t v; + int ret; + if (a->expn == BF_EXP_NAN) { + goto overflow; + } else if (a->expn <= 0) { + v = 0; + ret = 0; + } else if (a->sign) { + v = 0; + ret = BF_ST_INVALID_OP; + } else if (a->expn <= 64) { +#if LIMB_BITS == 32 + if (a->expn <= 32) + v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn); + else + v = (((uint64_t)a->tab[a->len - 1] << 32) | + get_limbz(a, a->len - 2)) >> (64 - a->expn); +#else + v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn); +#endif + ret = 0; + } else { + overflow: + v = UINT64_MAX; + ret = BF_ST_INVALID_OP; + } + *pres = v; + return ret; +} + +/* base conversion from radix */ + +static const uint8_t digits_per_limb_table[BF_RADIX_MAX - 1] = { +#if LIMB_BITS == 32 +32,20,16,13,12,11,10,10, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, +#else +64,40,32,27,24,22,21,20,19,18,17,17,16,16,16,15,15,15,14,14,14,14,13,13,13,13,13,13,13,12,12,12,12,12,12, +#endif +}; + +static limb_t get_limb_radix(int radix) +{ + int i, k; + limb_t radixl; + + k = digits_per_limb_table[radix - 2]; + radixl = radix; + for(i = 1; i < k; i++) + radixl *= radix; + return radixl; +} + +/* return != 0 if error */ +static int bf_integer_from_radix_rec(bf_t *r, const limb_t *tab, + limb_t n, int level, limb_t n0, + limb_t radix, bf_t *pow_tab) +{ + int ret; + if (n == 1) { + ret = bf_set_ui(r, tab[0]); + } else { + bf_t T_s, *T = &T_s, *B; + limb_t n1, n2; + + n2 = (((n0 * 2) >> (level + 1)) + 1) / 2; + n1 = n - n2; + // printf("level=%d n0=%ld n1=%ld n2=%ld\n", level, n0, n1, n2); + B = &pow_tab[level]; + if (B->len == 0) { + ret = bf_pow_ui_ui(B, radix, n2, BF_PREC_INF, BF_RNDZ); + if (ret) + return ret; + } + ret = bf_integer_from_radix_rec(r, tab + n2, n1, level + 1, n0, + radix, pow_tab); + if (ret) + return ret; + ret = bf_mul(r, r, B, BF_PREC_INF, BF_RNDZ); + if (ret) + return ret; + bf_init(r->ctx, T); + ret = bf_integer_from_radix_rec(T, tab, n2, level + 1, n0, + radix, pow_tab); + if (!ret) + ret = bf_add(r, r, T, BF_PREC_INF, BF_RNDZ); + bf_delete(T); + } + return ret; + // bf_print_str(" r=", r); +} + +/* return 0 if OK != 0 if memory error */ +static int bf_integer_from_radix(bf_t *r, const limb_t *tab, + limb_t n, limb_t radix) +{ + bf_context_t *s = r->ctx; + int pow_tab_len, i, ret; + limb_t radixl; + bf_t *pow_tab; + + radixl = get_limb_radix(radix); + pow_tab_len = ceil_log2(n) + 2; /* XXX: check */ + pow_tab = bf_malloc(s, sizeof(pow_tab[0]) * pow_tab_len); + if (!pow_tab) + return -1; + for(i = 0; i < pow_tab_len; i++) + bf_init(r->ctx, &pow_tab[i]); + ret = bf_integer_from_radix_rec(r, tab, n, 0, n, radixl, pow_tab); + for(i = 0; i < pow_tab_len; i++) { + bf_delete(&pow_tab[i]); + } + bf_free(s, pow_tab); + return ret; +} + +/* compute and round T * radix^expn. */ +int bf_mul_pow_radix(bf_t *r, const bf_t *T, limb_t radix, + slimb_t expn, limb_t prec, bf_flags_t flags) +{ + int ret, expn_sign, overflow; + slimb_t e, extra_bits, prec1, ziv_extra_bits; + bf_t B_s, *B = &B_s; + + if (T->len == 0) { + return bf_set(r, T); + } else if (expn == 0) { + ret = bf_set(r, T); + ret |= bf_round(r, prec, flags); + return ret; + } + + e = expn; + expn_sign = 0; + if (e < 0) { + e = -e; + expn_sign = 1; + } + bf_init(r->ctx, B); + if (prec == BF_PREC_INF) { + /* infinite precision: only used if the result is known to be exact */ + ret = bf_pow_ui_ui(B, radix, e, BF_PREC_INF, BF_RNDN); + if (expn_sign) { + ret |= bf_div(r, T, B, T->len * LIMB_BITS, BF_RNDN); + } else { + ret |= bf_mul(r, T, B, BF_PREC_INF, BF_RNDN); + } + } else { + ziv_extra_bits = 16; + for(;;) { + prec1 = prec + ziv_extra_bits; + /* XXX: correct overflow/underflow handling */ + /* XXX: rigorous error analysis needed */ + extra_bits = ceil_log2(e) * 2 + 1; + ret = bf_pow_ui_ui(B, radix, e, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP); + overflow = !bf_is_finite(B); + /* XXX: if bf_pow_ui_ui returns an exact result, can stop + after the next operation */ + if (expn_sign) + ret |= bf_div(r, T, B, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP); + else + ret |= bf_mul(r, T, B, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP); + if (ret & BF_ST_MEM_ERROR) + break; + if ((ret & BF_ST_INEXACT) && + !bf_can_round(r, prec, flags & BF_RND_MASK, prec1) && + !overflow) { + /* and more precision and retry */ + ziv_extra_bits = ziv_extra_bits + (ziv_extra_bits / 2); + } else { + /* XXX: need to use __bf_round() to pass the inexact + flag for the subnormal case */ + ret = bf_round(r, prec, flags) | (ret & BF_ST_INEXACT); + break; + } + } + } + bf_delete(B); + return ret; +} + +static inline int to_digit(int c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'A' && c <= 'Z') + return c - 'A' + 10; + else if (c >= 'a' && c <= 'z') + return c - 'a' + 10; + else + return 36; +} + +/* add a limb at 'pos' and decrement pos. new space is created if + needed. Return 0 if OK, -1 if memory error */ +static int bf_add_limb(bf_t *a, slimb_t *ppos, limb_t v) +{ + slimb_t pos; + pos = *ppos; + if (unlikely(pos < 0)) { + limb_t new_size, d, *new_tab; + new_size = bf_max(a->len + 1, a->len * 3 / 2); + new_tab = bf_realloc(a->ctx, a->tab, sizeof(limb_t) * new_size); + if (!new_tab) + return -1; + a->tab = new_tab; + d = new_size - a->len; + memmove(a->tab + d, a->tab, a->len * sizeof(limb_t)); + a->len = new_size; + pos += d; + } + a->tab[pos--] = v; + *ppos = pos; + return 0; +} + +static int bf_tolower(int c) +{ + if (c >= 'A' && c <= 'Z') + c = c - 'A' + 'a'; + return c; +} + +static int strcasestart(const char *str, const char *val, const char **ptr) +{ + const char *p, *q; + p = str; + q = val; + while (*q != '\0') { + if (bf_tolower(*p) != *q) + return 0; + p++; + q++; + } + if (ptr) + *ptr = p; + return 1; +} + +static int bf_atof_internal(bf_t *r, slimb_t *pexponent, + const char *str, const char **pnext, int radix, + limb_t prec, bf_flags_t flags, BOOL is_dec) +{ + const char *p, *p_start; + int is_neg, radix_bits, exp_is_neg, ret, digits_per_limb, shift; + limb_t cur_limb; + slimb_t pos, expn, int_len, digit_count; + BOOL has_decpt, is_bin_exp; + bf_t a_s, *a; + + *pexponent = 0; + p = str; + if (!(flags & BF_ATOF_NO_NAN_INF) && radix <= 16 && + strcasestart(p, "nan", &p)) { + bf_set_nan(r); + ret = 0; + goto done; + } + is_neg = 0; + + if (p[0] == '+') { + p++; + p_start = p; + } else if (p[0] == '-') { + is_neg = 1; + p++; + p_start = p; + } else { + p_start = p; + } + if (p[0] == '0') { + if ((p[1] == 'x' || p[1] == 'X') && + (radix == 0 || radix == 16) && + !(flags & BF_ATOF_NO_HEX)) { + radix = 16; + p += 2; + } else if ((p[1] == 'o' || p[1] == 'O') && + radix == 0 && (flags & BF_ATOF_BIN_OCT)) { + p += 2; + radix = 8; + } else if ((p[1] == 'b' || p[1] == 'B') && + radix == 0 && (flags & BF_ATOF_BIN_OCT)) { + p += 2; + radix = 2; + } else { + goto no_prefix; + } + /* there must be a digit after the prefix */ + if (to_digit((uint8_t)*p) >= radix) { + bf_set_nan(r); + ret = 0; + goto done; + } + no_prefix: ; + } else { + if (!(flags & BF_ATOF_NO_NAN_INF) && radix <= 16 && + strcasestart(p, "inf", &p)) { + bf_set_inf(r, is_neg); + ret = 0; + goto done; + } + } + + if (radix == 0) + radix = 10; + if (is_dec) { + assert(radix == 10); + radix_bits = 0; + a = r; + } else if ((radix & (radix - 1)) != 0) { + radix_bits = 0; /* base is not a power of two */ + a = &a_s; + bf_init(r->ctx, a); + } else { + radix_bits = ceil_log2(radix); + a = r; + } + + /* skip leading zeros */ + /* XXX: could also skip zeros after the decimal point */ + while (*p == '0') + p++; + + if (radix_bits) { + shift = digits_per_limb = LIMB_BITS; + } else { + radix_bits = 0; + shift = digits_per_limb = digits_per_limb_table[radix - 2]; + } + cur_limb = 0; + bf_resize(a, 1); + pos = 0; + has_decpt = FALSE; + int_len = digit_count = 0; + for(;;) { + limb_t c; + if (*p == '.' && (p > p_start || to_digit(p[1]) < radix)) { + if (has_decpt) + break; + has_decpt = TRUE; + int_len = digit_count; + p++; + } + c = to_digit(*p); + if (c >= radix) + break; + digit_count++; + p++; + if (radix_bits) { + shift -= radix_bits; + if (shift <= 0) { + cur_limb |= c >> (-shift); + if (bf_add_limb(a, &pos, cur_limb)) + goto mem_error; + if (shift < 0) + cur_limb = c << (LIMB_BITS + shift); + else + cur_limb = 0; + shift += LIMB_BITS; + } else { + cur_limb |= c << shift; + } + } else { + cur_limb = cur_limb * radix + c; + shift--; + if (shift == 0) { + if (bf_add_limb(a, &pos, cur_limb)) + goto mem_error; + shift = digits_per_limb; + cur_limb = 0; + } + } + } + if (!has_decpt) + int_len = digit_count; + + /* add the last limb and pad with zeros */ + if (shift != digits_per_limb) { + if (radix_bits == 0) { + while (shift != 0) { + cur_limb *= radix; + shift--; + } + } + if (bf_add_limb(a, &pos, cur_limb)) { + mem_error: + ret = BF_ST_MEM_ERROR; + if (!radix_bits) + bf_delete(a); + bf_set_nan(r); + goto done; + } + } + + /* reset the next limbs to zero (we prefer to reallocate in the + renormalization) */ + memset(a->tab, 0, (pos + 1) * sizeof(limb_t)); + + if (p == p_start) { + ret = 0; + if (!radix_bits) + bf_delete(a); + bf_set_nan(r); + goto done; + } + + /* parse the exponent, if any */ + expn = 0; + is_bin_exp = FALSE; + if (((radix == 10 && (*p == 'e' || *p == 'E')) || + (radix != 10 && (*p == '@' || + (radix_bits && (*p == 'p' || *p == 'P'))))) && + p > p_start) { + is_bin_exp = (*p == 'p' || *p == 'P'); + p++; + exp_is_neg = 0; + if (*p == '+') { + p++; + } else if (*p == '-') { + exp_is_neg = 1; + p++; + } + for(;;) { + int c; + c = to_digit(*p); + if (c >= 10) + break; + if (unlikely(expn > ((BF_RAW_EXP_MAX - 2 - 9) / 10))) { + /* exponent overflow */ + if (exp_is_neg) { + bf_set_zero(r, is_neg); + ret = BF_ST_UNDERFLOW | BF_ST_INEXACT; + } else { + bf_set_inf(r, is_neg); + ret = BF_ST_OVERFLOW | BF_ST_INEXACT; + } + goto done; + } + p++; + expn = expn * 10 + c; + } + if (exp_is_neg) + expn = -expn; + } + if (is_dec) { + a->expn = expn + int_len; + a->sign = is_neg; + ret = bfdec_normalize_and_round((bfdec_t *)a, prec, flags); + } else if (radix_bits) { + /* XXX: may overflow */ + if (!is_bin_exp) + expn *= radix_bits; + a->expn = expn + (int_len * radix_bits); + a->sign = is_neg; + ret = bf_normalize_and_round(a, prec, flags); + } else { + limb_t l; + pos++; + l = a->len - pos; /* number of limbs */ + if (l == 0) { + bf_set_zero(r, is_neg); + ret = 0; + } else { + bf_t T_s, *T = &T_s; + + expn -= l * digits_per_limb - int_len; + bf_init(r->ctx, T); + if (bf_integer_from_radix(T, a->tab + pos, l, radix)) { + bf_set_nan(r); + ret = BF_ST_MEM_ERROR; + } else { + T->sign = is_neg; + if (flags & BF_ATOF_EXPONENT) { + /* return the exponent */ + *pexponent = expn; + ret = bf_set(r, T); + } else { + ret = bf_mul_pow_radix(r, T, radix, expn, prec, flags); + } + } + bf_delete(T); + } + bf_delete(a); + } + done: + if (pnext) + *pnext = p; + return ret; +} + +/* + Return (status, n, exp). 'status' is the floating point status. 'n' + is the parsed number. + + If (flags & BF_ATOF_EXPONENT) and if the radix is not a power of + two, the parsed number is equal to r * + (*pexponent)^radix. Otherwise *pexponent = 0. +*/ +int bf_atof2(bf_t *r, slimb_t *pexponent, + const char *str, const char **pnext, int radix, + limb_t prec, bf_flags_t flags) +{ + return bf_atof_internal(r, pexponent, str, pnext, radix, prec, flags, + FALSE); +} + +int bf_atof(bf_t *r, const char *str, const char **pnext, int radix, + limb_t prec, bf_flags_t flags) +{ + slimb_t dummy_exp; + return bf_atof_internal(r, &dummy_exp, str, pnext, radix, prec, flags, FALSE); +} + +/* base conversion to radix */ + +#if LIMB_BITS == 64 +#define RADIXL_10 UINT64_C(10000000000000000000) +#else +#define RADIXL_10 UINT64_C(1000000000) +#endif + +static const uint32_t inv_log2_radix[BF_RADIX_MAX - 1][LIMB_BITS / 32 + 1] = { +#if LIMB_BITS == 32 +{ 0x80000000, 0x00000000,}, +{ 0x50c24e60, 0xd4d4f4a7,}, +{ 0x40000000, 0x00000000,}, +{ 0x372068d2, 0x0a1ee5ca,}, +{ 0x3184648d, 0xb8153e7a,}, +{ 0x2d983275, 0x9d5369c4,}, +{ 0x2aaaaaaa, 0xaaaaaaab,}, +{ 0x28612730, 0x6a6a7a54,}, +{ 0x268826a1, 0x3ef3fde6,}, +{ 0x25001383, 0xbac8a744,}, +{ 0x23b46706, 0x82c0c709,}, +{ 0x229729f1, 0xb2c83ded,}, +{ 0x219e7ffd, 0xa5ad572b,}, +{ 0x20c33b88, 0xda7c29ab,}, +{ 0x20000000, 0x00000000,}, +{ 0x1f50b57e, 0xac5884b3,}, +{ 0x1eb22cc6, 0x8aa6e26f,}, +{ 0x1e21e118, 0x0c5daab2,}, +{ 0x1d9dcd21, 0x439834e4,}, +{ 0x1d244c78, 0x367a0d65,}, +{ 0x1cb40589, 0xac173e0c,}, +{ 0x1c4bd95b, 0xa8d72b0d,}, +{ 0x1bead768, 0x98f8ce4c,}, +{ 0x1b903469, 0x050f72e5,}, +{ 0x1b3b433f, 0x2eb06f15,}, +{ 0x1aeb6f75, 0x9c46fc38,}, +{ 0x1aa038eb, 0x0e3bfd17,}, +{ 0x1a593062, 0xb38d8c56,}, +{ 0x1a15f4c3, 0x2b95a2e6,}, +{ 0x19d630dc, 0xcc7ddef9,}, +{ 0x19999999, 0x9999999a,}, +{ 0x195fec80, 0x8a609431,}, +{ 0x1928ee7b, 0x0b4f22f9,}, +{ 0x18f46acf, 0x8c06e318,}, +{ 0x18c23246, 0xdc0a9f3d,}, +#else +{ 0x80000000, 0x00000000, 0x00000000,}, +{ 0x50c24e60, 0xd4d4f4a7, 0x021f57bc,}, +{ 0x40000000, 0x00000000, 0x00000000,}, +{ 0x372068d2, 0x0a1ee5ca, 0x19ea911b,}, +{ 0x3184648d, 0xb8153e7a, 0x7fc2d2e1,}, +{ 0x2d983275, 0x9d5369c4, 0x4dec1661,}, +{ 0x2aaaaaaa, 0xaaaaaaaa, 0xaaaaaaab,}, +{ 0x28612730, 0x6a6a7a53, 0x810fabde,}, +{ 0x268826a1, 0x3ef3fde6, 0x23e2566b,}, +{ 0x25001383, 0xbac8a744, 0x385a3349,}, +{ 0x23b46706, 0x82c0c709, 0x3f891718,}, +{ 0x229729f1, 0xb2c83ded, 0x15fba800,}, +{ 0x219e7ffd, 0xa5ad572a, 0xe169744b,}, +{ 0x20c33b88, 0xda7c29aa, 0x9bddee52,}, +{ 0x20000000, 0x00000000, 0x00000000,}, +{ 0x1f50b57e, 0xac5884b3, 0x70e28eee,}, +{ 0x1eb22cc6, 0x8aa6e26f, 0x06d1a2a2,}, +{ 0x1e21e118, 0x0c5daab1, 0x81b4f4bf,}, +{ 0x1d9dcd21, 0x439834e3, 0x81667575,}, +{ 0x1d244c78, 0x367a0d64, 0xc8204d6d,}, +{ 0x1cb40589, 0xac173e0c, 0x3b7b16ba,}, +{ 0x1c4bd95b, 0xa8d72b0d, 0x5879f25a,}, +{ 0x1bead768, 0x98f8ce4c, 0x66cc2858,}, +{ 0x1b903469, 0x050f72e5, 0x0cf5488e,}, +{ 0x1b3b433f, 0x2eb06f14, 0x8c89719c,}, +{ 0x1aeb6f75, 0x9c46fc37, 0xab5fc7e9,}, +{ 0x1aa038eb, 0x0e3bfd17, 0x1bd62080,}, +{ 0x1a593062, 0xb38d8c56, 0x7998ab45,}, +{ 0x1a15f4c3, 0x2b95a2e6, 0x46aed6a0,}, +{ 0x19d630dc, 0xcc7ddef9, 0x5aadd61b,}, +{ 0x19999999, 0x99999999, 0x9999999a,}, +{ 0x195fec80, 0x8a609430, 0xe1106014,}, +{ 0x1928ee7b, 0x0b4f22f9, 0x5f69791d,}, +{ 0x18f46acf, 0x8c06e318, 0x4d2aeb2c,}, +{ 0x18c23246, 0xdc0a9f3d, 0x3fe16970,}, +#endif +}; + +static const limb_t log2_radix[BF_RADIX_MAX - 1] = { +#if LIMB_BITS == 32 +0x20000000, +0x32b80347, +0x40000000, +0x4a4d3c26, +0x52b80347, +0x59d5d9fd, +0x60000000, +0x6570068e, +0x6a4d3c26, +0x6eb3a9f0, +0x72b80347, +0x766a008e, +0x79d5d9fd, +0x7d053f6d, +0x80000000, +0x82cc7edf, +0x8570068e, +0x87ef05ae, +0x8a4d3c26, +0x8c8ddd45, +0x8eb3a9f0, +0x90c10501, +0x92b80347, +0x949a784c, +0x966a008e, +0x982809d6, +0x99d5d9fd, +0x9b74948f, +0x9d053f6d, +0x9e88c6b3, +0xa0000000, +0xa16bad37, +0xa2cc7edf, +0xa4231623, +0xa570068e, +#else +0x2000000000000000, +0x32b803473f7ad0f4, +0x4000000000000000, +0x4a4d3c25e68dc57f, +0x52b803473f7ad0f4, +0x59d5d9fd5010b366, +0x6000000000000000, +0x6570068e7ef5a1e8, +0x6a4d3c25e68dc57f, +0x6eb3a9f01975077f, +0x72b803473f7ad0f4, +0x766a008e4788cbcd, +0x79d5d9fd5010b366, +0x7d053f6d26089673, +0x8000000000000000, +0x82cc7edf592262d0, +0x8570068e7ef5a1e8, +0x87ef05ae409a0289, +0x8a4d3c25e68dc57f, +0x8c8ddd448f8b845a, +0x8eb3a9f01975077f, +0x90c10500d63aa659, +0x92b803473f7ad0f4, +0x949a784bcd1b8afe, +0x966a008e4788cbcd, +0x982809d5be7072dc, +0x99d5d9fd5010b366, +0x9b74948f5532da4b, +0x9d053f6d26089673, +0x9e88c6b3626a72aa, +0xa000000000000000, +0xa16bad3758efd873, +0xa2cc7edf592262d0, +0xa4231623369e78e6, +0xa570068e7ef5a1e8, +#endif +}; + +/* compute floor(a*b) or ceil(a*b) with b = log2(radix) or + b=1/log2(radix). For is_inv = 0, strict accuracy is not guaranteed + when radix is not a power of two. */ +slimb_t bf_mul_log2_radix(slimb_t a1, unsigned int radix, int is_inv, + int is_ceil1) +{ + int is_neg; + limb_t a; + BOOL is_ceil; + + is_ceil = is_ceil1; + a = a1; + if (a1 < 0) { + a = -a; + is_neg = 1; + } else { + is_neg = 0; + } + is_ceil ^= is_neg; + if ((radix & (radix - 1)) == 0) { + int radix_bits; + /* radix is a power of two */ + radix_bits = ceil_log2(radix); + if (is_inv) { + if (is_ceil) + a += radix_bits - 1; + a = a / radix_bits; + } else { + a = a * radix_bits; + } + } else { + const uint32_t *tab; + limb_t b0, b1; + dlimb_t t; + + if (is_inv) { + tab = inv_log2_radix[radix - 2]; +#if LIMB_BITS == 32 + b1 = tab[0]; + b0 = tab[1]; +#else + b1 = ((limb_t)tab[0] << 32) | tab[1]; + b0 = (limb_t)tab[2] << 32; +#endif + t = (dlimb_t)b0 * (dlimb_t)a; + t = (dlimb_t)b1 * (dlimb_t)a + (t >> LIMB_BITS); + a = t >> (LIMB_BITS - 1); + } else { + b0 = log2_radix[radix - 2]; + t = (dlimb_t)b0 * (dlimb_t)a; + a = t >> (LIMB_BITS - 3); + } + /* a = floor(result) and 'result' cannot be an integer */ + a += is_ceil; + } + if (is_neg) + a = -a; + return a; +} + +/* 'n' is the number of output limbs */ +static int bf_integer_to_radix_rec(bf_t *pow_tab, + limb_t *out, const bf_t *a, limb_t n, + int level, limb_t n0, limb_t radixl, + unsigned int radixl_bits) +{ + limb_t n1, n2, q_prec; + int ret; + + assert(n >= 1); + if (n == 1) { + out[0] = get_bits(a->tab, a->len, a->len * LIMB_BITS - a->expn); + } else if (n == 2) { + dlimb_t t; + slimb_t pos; + pos = a->len * LIMB_BITS - a->expn; + t = ((dlimb_t)get_bits(a->tab, a->len, pos + LIMB_BITS) << LIMB_BITS) | + get_bits(a->tab, a->len, pos); + if (likely(radixl == RADIXL_10)) { + /* use division by a constant when possible */ + out[0] = t % RADIXL_10; + out[1] = t / RADIXL_10; + } else { + out[0] = t % radixl; + out[1] = t / radixl; + } + } else { + bf_t Q, R, *B, *B_inv; + int q_add; + bf_init(a->ctx, &Q); + bf_init(a->ctx, &R); + n2 = (((n0 * 2) >> (level + 1)) + 1) / 2; + n1 = n - n2; + B = &pow_tab[2 * level]; + B_inv = &pow_tab[2 * level + 1]; + ret = 0; + if (B->len == 0) { + /* compute BASE^n2 */ + ret |= bf_pow_ui_ui(B, radixl, n2, BF_PREC_INF, BF_RNDZ); + /* we use enough bits for the maximum possible 'n1' value, + i.e. n2 + 1 */ + ret |= bf_set_ui(&R, 1); + ret |= bf_div(B_inv, &R, B, (n2 + 1) * radixl_bits + 2, BF_RNDN); + } + // printf("%d: n1=% " PRId64 " n2=%" PRId64 "\n", level, n1, n2); + q_prec = n1 * radixl_bits; + ret |= bf_mul(&Q, a, B_inv, q_prec, BF_RNDN); + ret |= bf_rint(&Q, BF_RNDZ); + + ret |= bf_mul(&R, &Q, B, BF_PREC_INF, BF_RNDZ); + ret |= bf_sub(&R, a, &R, BF_PREC_INF, BF_RNDZ); + + if (ret & BF_ST_MEM_ERROR) + goto fail; + /* adjust if necessary */ + q_add = 0; + while (R.sign && R.len != 0) { + if (bf_add(&R, &R, B, BF_PREC_INF, BF_RNDZ)) + goto fail; + q_add--; + } + while (bf_cmpu(&R, B) >= 0) { + if (bf_sub(&R, &R, B, BF_PREC_INF, BF_RNDZ)) + goto fail; + q_add++; + } + if (q_add != 0) { + if (bf_add_si(&Q, &Q, q_add, BF_PREC_INF, BF_RNDZ)) + goto fail; + } + if (bf_integer_to_radix_rec(pow_tab, out + n2, &Q, n1, level + 1, n0, + radixl, radixl_bits)) + goto fail; + if (bf_integer_to_radix_rec(pow_tab, out, &R, n2, level + 1, n0, + radixl, radixl_bits)) { + fail: + bf_delete(&Q); + bf_delete(&R); + return -1; + } + bf_delete(&Q); + bf_delete(&R); + } + return 0; +} + +/* return 0 if OK != 0 if memory error */ +static int bf_integer_to_radix(bf_t *r, const bf_t *a, limb_t radixl) +{ + bf_context_t *s = r->ctx; + limb_t r_len; + bf_t *pow_tab; + int i, pow_tab_len, ret; + + r_len = r->len; + pow_tab_len = (ceil_log2(r_len) + 2) * 2; /* XXX: check */ + pow_tab = bf_malloc(s, sizeof(pow_tab[0]) * pow_tab_len); + if (!pow_tab) + return -1; + for(i = 0; i < pow_tab_len; i++) + bf_init(r->ctx, &pow_tab[i]); + + ret = bf_integer_to_radix_rec(pow_tab, r->tab, a, r_len, 0, r_len, radixl, + ceil_log2(radixl)); + + for(i = 0; i < pow_tab_len; i++) { + bf_delete(&pow_tab[i]); + } + bf_free(s, pow_tab); + return ret; +} + +/* a must be >= 0. 'P' is the wanted number of digits in radix + 'radix'. 'r' is the mantissa represented as an integer. *pE + contains the exponent. Return != 0 if memory error. */ +static int bf_convert_to_radix(bf_t *r, slimb_t *pE, + const bf_t *a, int radix, + limb_t P, bf_rnd_t rnd_mode, + BOOL is_fixed_exponent) +{ + slimb_t E, e, prec, extra_bits, ziv_extra_bits, prec0; + bf_t B_s, *B = &B_s; + int e_sign, ret, res; + + if (a->len == 0) { + /* zero case */ + *pE = 0; + return bf_set(r, a); + } + + if (is_fixed_exponent) { + E = *pE; + } else { + /* compute the new exponent */ + E = 1 + bf_mul_log2_radix(a->expn - 1, radix, TRUE, FALSE); + } + // bf_print_str("a", a); + // printf("E=%ld P=%ld radix=%d\n", E, P, radix); + + for(;;) { + e = P - E; + e_sign = 0; + if (e < 0) { + e = -e; + e_sign = 1; + } + /* Note: precision for log2(radix) is not critical here */ + prec0 = bf_mul_log2_radix(P, radix, FALSE, TRUE); + ziv_extra_bits = 16; + for(;;) { + prec = prec0 + ziv_extra_bits; + /* XXX: rigorous error analysis needed */ + extra_bits = ceil_log2(e) * 2 + 1; + ret = bf_pow_ui_ui(r, radix, e, prec + extra_bits, + BF_RNDN | BF_FLAG_EXT_EXP); + if (!e_sign) + ret |= bf_mul(r, r, a, prec + extra_bits, + BF_RNDN | BF_FLAG_EXT_EXP); + else + ret |= bf_div(r, a, r, prec + extra_bits, + BF_RNDN | BF_FLAG_EXT_EXP); + if (ret & BF_ST_MEM_ERROR) + return BF_ST_MEM_ERROR; + /* if the result is not exact, check that it can be safely + rounded to an integer */ + if ((ret & BF_ST_INEXACT) && + !bf_can_round(r, r->expn, rnd_mode, prec)) { + /* and more precision and retry */ + ziv_extra_bits = ziv_extra_bits + (ziv_extra_bits / 2); + continue; + } else { + ret = bf_rint(r, rnd_mode); + if (ret & BF_ST_MEM_ERROR) + return BF_ST_MEM_ERROR; + break; + } + } + if (is_fixed_exponent) + break; + /* check that the result is < B^P */ + /* XXX: do a fast approximate test first ? */ + bf_init(r->ctx, B); + ret = bf_pow_ui_ui(B, radix, P, BF_PREC_INF, BF_RNDZ); + if (ret) { + bf_delete(B); + return ret; + } + res = bf_cmpu(r, B); + bf_delete(B); + if (res < 0) + break; + /* try a larger exponent */ + E++; + } + *pE = E; + return 0; +} + +static void limb_to_a(char *buf, limb_t n, unsigned int radix, int len) +{ + int digit, i; + + if (radix == 10) { + /* specific case with constant divisor */ + for(i = len - 1; i >= 0; i--) { + digit = (limb_t)n % 10; + n = (limb_t)n / 10; + buf[i] = digit + '0'; + } + } else { + for(i = len - 1; i >= 0; i--) { + digit = (limb_t)n % radix; + n = (limb_t)n / radix; + if (digit < 10) + digit += '0'; + else + digit += 'a' - 10; + buf[i] = digit; + } + } +} + +/* for power of 2 radixes */ +static void limb_to_a2(char *buf, limb_t n, unsigned int radix_bits, int len) +{ + int digit, i; + unsigned int mask; + + mask = (1 << radix_bits) - 1; + for(i = len - 1; i >= 0; i--) { + digit = n & mask; + n >>= radix_bits; + if (digit < 10) + digit += '0'; + else + digit += 'a' - 10; + buf[i] = digit; + } +} + +/* 'a' must be an integer if the is_dec = FALSE or if the radix is not + a power of two. A dot is added before the 'dot_pos' digit. dot_pos + = n_digits does not display the dot. 0 <= dot_pos <= + n_digits. n_digits >= 1. */ +static void output_digits(DynBuf *s, const bf_t *a1, int radix, limb_t n_digits, + limb_t dot_pos, BOOL is_dec) +{ + limb_t i, v, l; + slimb_t pos, pos_incr; + int digits_per_limb, buf_pos, radix_bits, first_buf_pos; + char buf[65]; + bf_t a_s, *a; + + if (is_dec) { + digits_per_limb = LIMB_DIGITS; + a = (bf_t *)a1; + radix_bits = 0; + pos = a->len; + pos_incr = 1; + first_buf_pos = 0; + } else if ((radix & (radix - 1)) == 0) { + a = (bf_t *)a1; + radix_bits = ceil_log2(radix); + digits_per_limb = LIMB_BITS / radix_bits; + pos_incr = digits_per_limb * radix_bits; + /* digits are aligned relative to the radix point */ + pos = a->len * LIMB_BITS + smod(-a->expn, radix_bits); + first_buf_pos = 0; + } else { + limb_t n, radixl; + + digits_per_limb = digits_per_limb_table[radix - 2]; + radixl = get_limb_radix(radix); + a = &a_s; + bf_init(a1->ctx, a); + n = (n_digits + digits_per_limb - 1) / digits_per_limb; + if (bf_resize(a, n)) { + dbuf_set_error(s); + goto done; + } + if (bf_integer_to_radix(a, a1, radixl)) { + dbuf_set_error(s); + goto done; + } + radix_bits = 0; + pos = n; + pos_incr = 1; + first_buf_pos = pos * digits_per_limb - n_digits; + } + buf_pos = digits_per_limb; + i = 0; + while (i < n_digits) { + if (buf_pos == digits_per_limb) { + pos -= pos_incr; + if (radix_bits == 0) { + v = get_limbz(a, pos); + limb_to_a(buf, v, radix, digits_per_limb); + } else { + v = get_bits(a->tab, a->len, pos); + limb_to_a2(buf, v, radix_bits, digits_per_limb); + } + buf_pos = first_buf_pos; + first_buf_pos = 0; + } + if (i < dot_pos) { + l = dot_pos; + } else { + if (i == dot_pos) + dbuf_putc(s, '.'); + l = n_digits; + } + l = bf_min(digits_per_limb - buf_pos, l - i); + dbuf_put(s, (uint8_t *)(buf + buf_pos), l); + buf_pos += l; + i += l; + } + done: + if (a != a1) + bf_delete(a); +} + +static void *bf_dbuf_realloc(void *opaque, void *ptr, size_t size) +{ + bf_context_t *s = opaque; + return bf_realloc(s, ptr, size); +} + +/* return the length in bytes. A trailing '\0' is added */ +static char *bf_ftoa_internal(size_t *plen, const bf_t *a2, int radix, + limb_t prec, bf_flags_t flags, BOOL is_dec) +{ + bf_context_t *ctx = a2->ctx; + DynBuf s_s, *s = &s_s; + int radix_bits; + + // bf_print_str("ftoa", a2); + // printf("radix=%d\n", radix); + dbuf_init2(s, ctx, bf_dbuf_realloc); + if (a2->expn == BF_EXP_NAN) { + dbuf_putstr(s, "NaN"); + } else { + if (a2->sign) + dbuf_putc(s, '-'); + if (a2->expn == BF_EXP_INF) { + if (flags & BF_FTOA_JS_QUIRKS) + dbuf_putstr(s, "Infinity"); + else + dbuf_putstr(s, "Inf"); + } else { + int fmt, ret; + slimb_t n_digits, n, i, n_max, n1; + bf_t a1_s, *a1 = &a1_s; + + if ((radix & (radix - 1)) != 0) + radix_bits = 0; + else + radix_bits = ceil_log2(radix); + + fmt = flags & BF_FTOA_FORMAT_MASK; + bf_init(ctx, a1); + if (fmt == BF_FTOA_FORMAT_FRAC) { + if (is_dec || radix_bits != 0) { + if (bf_set(a1, a2)) + goto fail1; +#ifdef USE_BF_DEC + if (is_dec) { + if (bfdec_round((bfdec_t *)a1, prec, (flags & BF_RND_MASK) | BF_FLAG_RADPNT_PREC) & BF_ST_MEM_ERROR) + goto fail1; + n = a1->expn; + } else +#endif + { + if (bf_round(a1, prec * radix_bits, (flags & BF_RND_MASK) | BF_FLAG_RADPNT_PREC) & BF_ST_MEM_ERROR) + goto fail1; + n = ceil_div(a1->expn, radix_bits); + } + if (flags & BF_FTOA_ADD_PREFIX) { + if (radix == 16) + dbuf_putstr(s, "0x"); + else if (radix == 8) + dbuf_putstr(s, "0o"); + else if (radix == 2) + dbuf_putstr(s, "0b"); + } + if (a1->expn == BF_EXP_ZERO) { + dbuf_putstr(s, "0"); + if (prec > 0) { + dbuf_putstr(s, "."); + for(i = 0; i < prec; i++) { + dbuf_putc(s, '0'); + } + } + } else { + n_digits = prec + n; + if (n <= 0) { + /* 0.x */ + dbuf_putstr(s, "0."); + for(i = 0; i < -n; i++) { + dbuf_putc(s, '0'); + } + if (n_digits > 0) { + output_digits(s, a1, radix, n_digits, n_digits, is_dec); + } + } else { + output_digits(s, a1, radix, n_digits, n, is_dec); + } + } + } else { + size_t pos, start; + bf_t a_s, *a = &a_s; + + /* make a positive number */ + a->tab = a2->tab; + a->len = a2->len; + a->expn = a2->expn; + a->sign = 0; + + /* one more digit for the rounding */ + n = 1 + bf_mul_log2_radix(bf_max(a->expn, 0), radix, TRUE, TRUE); + n_digits = n + prec; + n1 = n; + if (bf_convert_to_radix(a1, &n1, a, radix, n_digits, + flags & BF_RND_MASK, TRUE)) + goto fail1; + start = s->size; + output_digits(s, a1, radix, n_digits, n, is_dec); + /* remove leading zeros because we allocated one more digit */ + pos = start; + while ((pos + 1) < s->size && s->buf[pos] == '0' && + s->buf[pos + 1] != '.') + pos++; + if (pos > start) { + memmove(s->buf + start, s->buf + pos, s->size - pos); + s->size -= (pos - start); + } + } + } else { +#ifdef USE_BF_DEC + if (is_dec) { + if (bf_set(a1, a2)) + goto fail1; + if (fmt == BF_FTOA_FORMAT_FIXED) { + n_digits = prec; + n_max = n_digits; + if (bfdec_round((bfdec_t *)a1, prec, (flags & BF_RND_MASK)) & BF_ST_MEM_ERROR) + goto fail1; + } else { + /* prec is ignored */ + prec = n_digits = a1->len * LIMB_DIGITS; + /* remove the trailing zero digits */ + while (n_digits > 1 && + get_digit(a1->tab, a1->len, prec - n_digits) == 0) { + n_digits--; + } + n_max = n_digits + 4; + } + n = a1->expn; + } else +#endif + if (radix_bits != 0) { + if (bf_set(a1, a2)) + goto fail1; + if (fmt == BF_FTOA_FORMAT_FIXED) { + slimb_t prec_bits; + n_digits = prec; + n_max = n_digits; + /* align to the radix point */ + prec_bits = prec * radix_bits - + smod(-a1->expn, radix_bits); + if (bf_round(a1, prec_bits, + (flags & BF_RND_MASK)) & BF_ST_MEM_ERROR) + goto fail1; + } else { + limb_t digit_mask; + slimb_t pos; + /* position of the digit before the most + significant digit in bits */ + pos = a1->len * LIMB_BITS + + smod(-a1->expn, radix_bits); + n_digits = ceil_div(pos, radix_bits); + /* remove the trailing zero digits */ + digit_mask = ((limb_t)1 << radix_bits) - 1; + while (n_digits > 1 && + (get_bits(a1->tab, a1->len, pos - n_digits * radix_bits) & digit_mask) == 0) { + n_digits--; + } + n_max = n_digits + 4; + } + n = ceil_div(a1->expn, radix_bits); + } else { + bf_t a_s, *a = &a_s; + + /* make a positive number */ + a->tab = a2->tab; + a->len = a2->len; + a->expn = a2->expn; + a->sign = 0; + + if (fmt == BF_FTOA_FORMAT_FIXED) { + n_digits = prec; + n_max = n_digits; + } else { + slimb_t n_digits_max, n_digits_min; + + assert(prec != BF_PREC_INF); + n_digits = 1 + bf_mul_log2_radix(prec, radix, TRUE, TRUE); + /* max number of digits for non exponential + notation. The rational is to have the same rule + as JS i.e. n_max = 21 for 64 bit float in base 10. */ + n_max = n_digits + 4; + if (fmt == BF_FTOA_FORMAT_FREE_MIN) { + bf_t b_s, *b = &b_s; + + /* find the minimum number of digits by + dichotomy. */ + /* XXX: inefficient */ + n_digits_max = n_digits; + n_digits_min = 1; + bf_init(ctx, b); + while (n_digits_min < n_digits_max) { + n_digits = (n_digits_min + n_digits_max) / 2; + if (bf_convert_to_radix(a1, &n, a, radix, n_digits, + flags & BF_RND_MASK, FALSE)) { + bf_delete(b); + goto fail1; + } + /* convert back to a number and compare */ + ret = bf_mul_pow_radix(b, a1, radix, n - n_digits, + prec, + (flags & ~BF_RND_MASK) | + BF_RNDN); + if (ret & BF_ST_MEM_ERROR) { + bf_delete(b); + goto fail1; + } + if (bf_cmpu(b, a) == 0) { + n_digits_max = n_digits; + } else { + n_digits_min = n_digits + 1; + } + } + bf_delete(b); + n_digits = n_digits_max; + } + } + if (bf_convert_to_radix(a1, &n, a, radix, n_digits, + flags & BF_RND_MASK, FALSE)) { + fail1: + bf_delete(a1); + goto fail; + } + } + if (a1->expn == BF_EXP_ZERO && + fmt != BF_FTOA_FORMAT_FIXED && + !(flags & BF_FTOA_FORCE_EXP)) { + /* just output zero */ + dbuf_putstr(s, "0"); + } else { + if (flags & BF_FTOA_ADD_PREFIX) { + if (radix == 16) + dbuf_putstr(s, "0x"); + else if (radix == 8) + dbuf_putstr(s, "0o"); + else if (radix == 2) + dbuf_putstr(s, "0b"); + } + if (a1->expn == BF_EXP_ZERO) + n = 1; + if ((flags & BF_FTOA_FORCE_EXP) || + n <= -6 || n > n_max) { + const char *fmt; + /* exponential notation */ + output_digits(s, a1, radix, n_digits, 1, is_dec); + if (radix_bits != 0 && radix <= 16) { + if (flags & BF_FTOA_JS_QUIRKS) + fmt = "p%+" PRId_LIMB; + else + fmt = "p%" PRId_LIMB; + dbuf_printf(s, fmt, (n - 1) * radix_bits); + } else { + if (flags & BF_FTOA_JS_QUIRKS) + fmt = "%c%+" PRId_LIMB; + else + fmt = "%c%" PRId_LIMB; + dbuf_printf(s, fmt, + radix <= 10 ? 'e' : '@', n - 1); + } + } else if (n <= 0) { + /* 0.x */ + dbuf_putstr(s, "0."); + for(i = 0; i < -n; i++) { + dbuf_putc(s, '0'); + } + output_digits(s, a1, radix, n_digits, n_digits, is_dec); + } else { + if (n_digits <= n) { + /* no dot */ + output_digits(s, a1, radix, n_digits, n_digits, is_dec); + for(i = 0; i < (n - n_digits); i++) + dbuf_putc(s, '0'); + } else { + output_digits(s, a1, radix, n_digits, n, is_dec); + } + } + } + } + bf_delete(a1); + } + } + dbuf_putc(s, '\0'); + if (dbuf_error(s)) + goto fail; + if (plen) + *plen = s->size - 1; + return (char *)s->buf; + fail: + bf_free(ctx, s->buf); + if (plen) + *plen = 0; + return NULL; +} + +char *bf_ftoa(size_t *plen, const bf_t *a, int radix, limb_t prec, + bf_flags_t flags) +{ + return bf_ftoa_internal(plen, a, radix, prec, flags, FALSE); +} + +/***************************************************************/ +/* transcendental functions */ + +/* Note: the algorithm is from MPFR */ +static void bf_const_log2_rec(bf_t *T, bf_t *P, bf_t *Q, limb_t n1, + limb_t n2, BOOL need_P) +{ + bf_context_t *s = T->ctx; + if ((n2 - n1) == 1) { + if (n1 == 0) { + bf_set_ui(P, 3); + } else { + bf_set_ui(P, n1); + P->sign = 1; + } + bf_set_ui(Q, 2 * n1 + 1); + Q->expn += 2; + bf_set(T, P); + } else { + limb_t m; + bf_t T1_s, *T1 = &T1_s; + bf_t P1_s, *P1 = &P1_s; + bf_t Q1_s, *Q1 = &Q1_s; + + m = n1 + ((n2 - n1) >> 1); + bf_const_log2_rec(T, P, Q, n1, m, TRUE); + bf_init(s, T1); + bf_init(s, P1); + bf_init(s, Q1); + bf_const_log2_rec(T1, P1, Q1, m, n2, need_P); + bf_mul(T, T, Q1, BF_PREC_INF, BF_RNDZ); + bf_mul(T1, T1, P, BF_PREC_INF, BF_RNDZ); + bf_add(T, T, T1, BF_PREC_INF, BF_RNDZ); + if (need_P) + bf_mul(P, P, P1, BF_PREC_INF, BF_RNDZ); + bf_mul(Q, Q, Q1, BF_PREC_INF, BF_RNDZ); + bf_delete(T1); + bf_delete(P1); + bf_delete(Q1); + } +} + +/* compute log(2) with faithful rounding at precision 'prec' */ +static void bf_const_log2_internal(bf_t *T, limb_t prec) +{ + limb_t w, N; + bf_t P_s, *P = &P_s; + bf_t Q_s, *Q = &Q_s; + + w = prec + 15; + N = w / 3 + 1; + bf_init(T->ctx, P); + bf_init(T->ctx, Q); + bf_const_log2_rec(T, P, Q, 0, N, FALSE); + bf_div(T, T, Q, prec, BF_RNDN); + bf_delete(P); + bf_delete(Q); +} + +/* PI constant */ + +#define CHUD_A 13591409 +#define CHUD_B 545140134 +#define CHUD_C 640320 +#define CHUD_BITS_PER_TERM 47 + +static void chud_bs(bf_t *P, bf_t *Q, bf_t *G, int64_t a, int64_t b, int need_g, + limb_t prec) +{ + bf_context_t *s = P->ctx; + int64_t c; + + if (a == (b - 1)) { + bf_t T0, T1; + + bf_init(s, &T0); + bf_init(s, &T1); + bf_set_ui(G, 2 * b - 1); + bf_mul_ui(G, G, 6 * b - 1, prec, BF_RNDN); + bf_mul_ui(G, G, 6 * b - 5, prec, BF_RNDN); + bf_set_ui(&T0, CHUD_B); + bf_mul_ui(&T0, &T0, b, prec, BF_RNDN); + bf_set_ui(&T1, CHUD_A); + bf_add(&T0, &T0, &T1, prec, BF_RNDN); + bf_mul(P, G, &T0, prec, BF_RNDN); + P->sign = b & 1; + + bf_set_ui(Q, b); + bf_mul_ui(Q, Q, b, prec, BF_RNDN); + bf_mul_ui(Q, Q, b, prec, BF_RNDN); + bf_mul_ui(Q, Q, (uint64_t)CHUD_C * CHUD_C * CHUD_C / 24, prec, BF_RNDN); + bf_delete(&T0); + bf_delete(&T1); + } else { + bf_t P2, Q2, G2; + + bf_init(s, &P2); + bf_init(s, &Q2); + bf_init(s, &G2); + + c = (a + b) / 2; + chud_bs(P, Q, G, a, c, 1, prec); + chud_bs(&P2, &Q2, &G2, c, b, need_g, prec); + + /* Q = Q1 * Q2 */ + /* G = G1 * G2 */ + /* P = P1 * Q2 + P2 * G1 */ + bf_mul(&P2, &P2, G, prec, BF_RNDN); + if (!need_g) + bf_set_ui(G, 0); + bf_mul(P, P, &Q2, prec, BF_RNDN); + bf_add(P, P, &P2, prec, BF_RNDN); + bf_delete(&P2); + + bf_mul(Q, Q, &Q2, prec, BF_RNDN); + bf_delete(&Q2); + if (need_g) + bf_mul(G, G, &G2, prec, BF_RNDN); + bf_delete(&G2); + } +} + +/* compute Pi with faithful rounding at precision 'prec' using the + Chudnovsky formula */ +static void bf_const_pi_internal(bf_t *Q, limb_t prec) +{ + bf_context_t *s = Q->ctx; + int64_t n, prec1; + bf_t P, G; + + /* number of serie terms */ + n = prec / CHUD_BITS_PER_TERM + 1; + /* XXX: precision analysis */ + prec1 = prec + 32; + + bf_init(s, &P); + bf_init(s, &G); + + chud_bs(&P, Q, &G, 0, n, 0, BF_PREC_INF); + + bf_mul_ui(&G, Q, CHUD_A, prec1, BF_RNDN); + bf_add(&P, &G, &P, prec1, BF_RNDN); + bf_div(Q, Q, &P, prec1, BF_RNDF); + + bf_set_ui(&P, CHUD_C); + bf_sqrt(&G, &P, prec1, BF_RNDF); + bf_mul_ui(&G, &G, (uint64_t)CHUD_C / 12, prec1, BF_RNDF); + bf_mul(Q, Q, &G, prec, BF_RNDN); + bf_delete(&P); + bf_delete(&G); +} + +static int bf_const_get(bf_t *T, limb_t prec, bf_flags_t flags, + BFConstCache *c, + void (*func)(bf_t *res, limb_t prec), int sign) +{ + limb_t ziv_extra_bits, prec1; + + ziv_extra_bits = 32; + for(;;) { + prec1 = prec + ziv_extra_bits; + if (c->prec < prec1) { + if (c->val.len == 0) + bf_init(T->ctx, &c->val); + func(&c->val, prec1); + c->prec = prec1; + } else { + prec1 = c->prec; + } + bf_set(T, &c->val); + T->sign = sign; + if (!bf_can_round(T, prec, flags & BF_RND_MASK, prec1)) { + /* and more precision and retry */ + ziv_extra_bits = ziv_extra_bits + (ziv_extra_bits / 2); + } else { + break; + } + } + return bf_round(T, prec, flags); +} + +static void bf_const_free(BFConstCache *c) +{ + bf_delete(&c->val); + memset(c, 0, sizeof(*c)); +} + +int bf_const_log2(bf_t *T, limb_t prec, bf_flags_t flags) +{ + bf_context_t *s = T->ctx; + return bf_const_get(T, prec, flags, &s->log2_cache, bf_const_log2_internal, 0); +} + +/* return rounded pi * (1 - 2 * sign) */ +static int bf_const_pi_signed(bf_t *T, int sign, limb_t prec, bf_flags_t flags) +{ + bf_context_t *s = T->ctx; + return bf_const_get(T, prec, flags, &s->pi_cache, bf_const_pi_internal, + sign); +} + +int bf_const_pi(bf_t *T, limb_t prec, bf_flags_t flags) +{ + return bf_const_pi_signed(T, 0, prec, flags); +} + +void bf_clear_cache(bf_context_t *s) +{ +#ifdef USE_FFT_MUL + fft_clear_cache(s); +#endif + bf_const_free(&s->log2_cache); + bf_const_free(&s->pi_cache); +} + +/* ZivFunc should compute the result 'r' with faithful rounding at + precision 'prec'. For efficiency purposes, the final bf_round() + does not need to be done in the function. */ +typedef int ZivFunc(bf_t *r, const bf_t *a, limb_t prec, void *opaque); + +static int bf_ziv_rounding(bf_t *r, const bf_t *a, + limb_t prec, bf_flags_t flags, + ZivFunc *f, void *opaque) +{ + int rnd_mode, ret; + slimb_t prec1, ziv_extra_bits; + + rnd_mode = flags & BF_RND_MASK; + if (rnd_mode == BF_RNDF) { + /* no need to iterate */ + f(r, a, prec, opaque); + ret = 0; + } else { + ziv_extra_bits = 32; + for(;;) { + prec1 = prec + ziv_extra_bits; + ret = f(r, a, prec1, opaque); + if (ret & (BF_ST_OVERFLOW | BF_ST_UNDERFLOW | BF_ST_MEM_ERROR)) { + /* overflow or underflow should never happen because + it indicates the rounding cannot be done correctly, + but we do not catch all the cases */ + return ret; + } + /* if the result is exact, we can stop */ + if (!(ret & BF_ST_INEXACT)) { + ret = 0; + break; + } + if (bf_can_round(r, prec, rnd_mode, prec1)) { + ret = BF_ST_INEXACT; + break; + } + ziv_extra_bits = ziv_extra_bits * 2; + // printf("ziv_extra_bits=%" PRId64 "\n", (int64_t)ziv_extra_bits); + } + } + if (r->len == 0) + return ret; + else + return __bf_round(r, prec, flags, r->len, ret); +} + +/* add (1 - 2*e_sign) * 2^e */ +static int bf_add_epsilon(bf_t *r, const bf_t *a, slimb_t e, int e_sign, + limb_t prec, int flags) +{ + bf_t T_s, *T = &T_s; + int ret; + /* small argument case: result = 1 + epsilon * sign(x) */ + bf_init(a->ctx, T); + bf_set_ui(T, 1); + T->sign = e_sign; + T->expn += e; + ret = bf_add(r, r, T, prec, flags); + bf_delete(T); + return ret; +} + +/* Compute the exponential using faithful rounding at precision 'prec'. + Note: the algorithm is from MPFR */ +static int bf_exp_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque) +{ + bf_context_t *s = r->ctx; + bf_t T_s, *T = &T_s; + slimb_t n, K, l, i, prec1; + + assert(r != a); + + /* argument reduction: + T = a - n*log(2) with 0 <= T < log(2) and n integer. + */ + bf_init(s, T); + if (a->expn <= -1) { + /* 0 <= abs(a) <= 0.5 */ + if (a->sign) + n = -1; + else + n = 0; + } else { + bf_const_log2(T, LIMB_BITS, BF_RNDZ); + bf_div(T, a, T, LIMB_BITS, BF_RNDD); + bf_get_limb(&n, T, 0); + } + + K = bf_isqrt((prec + 1) / 2); + l = (prec - 1) / K + 1; + /* XXX: precision analysis ? */ + prec1 = prec + (K + 2 * l + 18) + K + 8; + if (a->expn > 0) + prec1 += a->expn; + // printf("n=%ld K=%ld prec1=%ld\n", n, K, prec1); + + bf_const_log2(T, prec1, BF_RNDF); + bf_mul_si(T, T, n, prec1, BF_RNDN); + bf_sub(T, a, T, prec1, BF_RNDN); + + /* reduce the range of T */ + bf_mul_2exp(T, -K, BF_PREC_INF, BF_RNDZ); + + /* Taylor expansion around zero : + 1 + x + x^2/2 + ... + x^n/n! + = (1 + x * (1 + x/2 * (1 + ... (x/n)))) + */ + { + bf_t U_s, *U = &U_s; + + bf_init(s, U); + bf_set_ui(r, 1); + for(i = l ; i >= 1; i--) { + bf_set_ui(U, i); + bf_div(U, T, U, prec1, BF_RNDN); + bf_mul(r, r, U, prec1, BF_RNDN); + bf_add_si(r, r, 1, prec1, BF_RNDN); + } + bf_delete(U); + } + bf_delete(T); + + /* undo the range reduction */ + for(i = 0; i < K; i++) { + bf_mul(r, r, r, prec1, BF_RNDN | BF_FLAG_EXT_EXP); + } + + /* undo the argument reduction */ + bf_mul_2exp(r, n, BF_PREC_INF, BF_RNDZ | BF_FLAG_EXT_EXP); + + return BF_ST_INEXACT; +} + +/* crude overflow and underflow tests for exp(a). a_low <= a <= a_high */ +static int check_exp_underflow_overflow(bf_context_t *s, bf_t *r, + const bf_t *a_low, const bf_t *a_high, + limb_t prec, bf_flags_t flags) +{ + bf_t T_s, *T = &T_s; + bf_t log2_s, *log2 = &log2_s; + slimb_t e_min, e_max; + + if (a_high->expn <= 0) + return 0; + + e_max = (limb_t)1 << (bf_get_exp_bits(flags) - 1); + e_min = -e_max + 3; + if (flags & BF_FLAG_SUBNORMAL) + e_min -= (prec - 1); + + bf_init(s, T); + bf_init(s, log2); + bf_const_log2(log2, LIMB_BITS, BF_RNDU); + bf_mul_ui(T, log2, e_max, LIMB_BITS, BF_RNDU); + /* a_low > e_max * log(2) implies exp(a) > e_max */ + if (bf_cmp_lt(T, a_low) > 0) { + /* overflow */ + bf_delete(T); + bf_delete(log2); + return bf_set_overflow(r, 0, prec, flags); + } + /* a_high < (e_min - 2) * log(2) implies exp(a) < (e_min - 2) */ + bf_const_log2(log2, LIMB_BITS, BF_RNDD); + bf_mul_si(T, log2, e_min - 2, LIMB_BITS, BF_RNDD); + if (bf_cmp_lt(a_high, T)) { + int rnd_mode = flags & BF_RND_MASK; + + /* underflow */ + bf_delete(T); + bf_delete(log2); + if (rnd_mode == BF_RNDU) { + /* set the smallest value */ + bf_set_ui(r, 1); + r->expn = e_min; + } else { + bf_set_zero(r, 0); + } + return BF_ST_UNDERFLOW | BF_ST_INEXACT; + } + bf_delete(log2); + bf_delete(T); + return 0; +} + +int bf_exp(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) +{ + bf_context_t *s = r->ctx; + int ret; + assert(r != a); + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bf_set_nan(r); + } else if (a->expn == BF_EXP_INF) { + if (a->sign) + bf_set_zero(r, 0); + else + bf_set_inf(r, 0); + } else { + bf_set_ui(r, 1); + } + return 0; + } + + ret = check_exp_underflow_overflow(s, r, a, a, prec, flags); + if (ret) + return ret; + if (a->expn < 0 && (-a->expn) >= (prec + 2)) { + /* small argument case: result = 1 + epsilon * sign(x) */ + bf_set_ui(r, 1); + return bf_add_epsilon(r, r, -(prec + 2), a->sign, prec, flags); + } + + return bf_ziv_rounding(r, a, prec, flags, bf_exp_internal, NULL); +} + +static int bf_log_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque) +{ + bf_context_t *s = r->ctx; + bf_t T_s, *T = &T_s; + bf_t U_s, *U = &U_s; + bf_t V_s, *V = &V_s; + slimb_t n, prec1, l, i, K; + + assert(r != a); + + bf_init(s, T); + /* argument reduction 1 */ + /* T=a*2^n with 2/3 <= T <= 4/3 */ + { + bf_t U_s, *U = &U_s; + bf_set(T, a); + n = T->expn; + T->expn = 0; + /* U= ~ 2/3 */ + bf_init(s, U); + bf_set_ui(U, 0xaaaaaaaa); + U->expn = 0; + if (bf_cmp_lt(T, U)) { + T->expn++; + n--; + } + bf_delete(U); + } + // printf("n=%ld\n", n); + // bf_print_str("T", T); + + /* XXX: precision analysis */ + /* number of iterations for argument reduction 2 */ + K = bf_isqrt((prec + 1) / 2); + /* order of Taylor expansion */ + l = prec / (2 * K) + 1; + /* precision of the intermediate computations */ + prec1 = prec + K + 2 * l + 32; + + bf_init(s, U); + bf_init(s, V); + + /* Note: cancellation occurs here, so we use more precision (XXX: + reduce the precision by computing the exact cancellation) */ + bf_add_si(T, T, -1, BF_PREC_INF, BF_RNDN); + + /* argument reduction 2 */ + for(i = 0; i < K; i++) { + /* T = T / (1 + sqrt(1 + T)) */ + bf_add_si(U, T, 1, prec1, BF_RNDN); + bf_sqrt(V, U, prec1, BF_RNDF); + bf_add_si(U, V, 1, prec1, BF_RNDN); + bf_div(T, T, U, prec1, BF_RNDN); + } + + { + bf_t Y_s, *Y = &Y_s; + bf_t Y2_s, *Y2 = &Y2_s; + bf_init(s, Y); + bf_init(s, Y2); + + /* compute ln(1+x) = ln((1+y)/(1-y)) with y=x/(2+x) + = y + y^3/3 + ... + y^(2*l + 1) / (2*l+1) + with Y=Y^2 + = y*(1+Y/3+Y^2/5+...) = y*(1+Y*(1/3+Y*(1/5 + ...))) + */ + bf_add_si(Y, T, 2, prec1, BF_RNDN); + bf_div(Y, T, Y, prec1, BF_RNDN); + + bf_mul(Y2, Y, Y, prec1, BF_RNDN); + bf_set_ui(r, 0); + for(i = l; i >= 1; i--) { + bf_set_ui(U, 1); + bf_set_ui(V, 2 * i + 1); + bf_div(U, U, V, prec1, BF_RNDN); + bf_add(r, r, U, prec1, BF_RNDN); + bf_mul(r, r, Y2, prec1, BF_RNDN); + } + bf_add_si(r, r, 1, prec1, BF_RNDN); + bf_mul(r, r, Y, prec1, BF_RNDN); + bf_delete(Y); + bf_delete(Y2); + } + bf_delete(V); + bf_delete(U); + + /* multiplication by 2 for the Taylor expansion and undo the + argument reduction 2*/ + bf_mul_2exp(r, K + 1, BF_PREC_INF, BF_RNDZ); + + /* undo the argument reduction 1 */ + bf_const_log2(T, prec1, BF_RNDF); + bf_mul_si(T, T, n, prec1, BF_RNDN); + bf_add(r, r, T, prec1, BF_RNDN); + + bf_delete(T); + return BF_ST_INEXACT; +} + +int bf_log(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) +{ + bf_context_t *s = r->ctx; + bf_t T_s, *T = &T_s; + + assert(r != a); + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bf_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF) { + if (a->sign) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } else { + bf_set_inf(r, 0); + return 0; + } + } else { + bf_set_inf(r, 1); + return 0; + } + } + if (a->sign) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } + bf_init(s, T); + bf_set_ui(T, 1); + if (bf_cmp_eq(a, T)) { + bf_set_zero(r, 0); + bf_delete(T); + return 0; + } + bf_delete(T); + + return bf_ziv_rounding(r, a, prec, flags, bf_log_internal, NULL); +} + +/* x and y finite and x > 0 */ +static int bf_pow_generic(bf_t *r, const bf_t *x, limb_t prec, void *opaque) +{ + bf_context_t *s = r->ctx; + const bf_t *y = opaque; + bf_t T_s, *T = &T_s; + limb_t prec1; + + bf_init(s, T); + /* XXX: proof for the added precision */ + prec1 = prec + 32; + bf_log(T, x, prec1, BF_RNDF | BF_FLAG_EXT_EXP); + bf_mul(T, T, y, prec1, BF_RNDF | BF_FLAG_EXT_EXP); + if (bf_is_nan(T)) + bf_set_nan(r); + else + bf_exp_internal(r, T, prec1, NULL); /* no overflow/underlow test needed */ + bf_delete(T); + return BF_ST_INEXACT; +} + +/* x and y finite, x > 0, y integer and y fits on one limb */ +static int bf_pow_int(bf_t *r, const bf_t *x, limb_t prec, void *opaque) +{ + bf_context_t *s = r->ctx; + const bf_t *y = opaque; + bf_t T_s, *T = &T_s; + limb_t prec1; + int ret; + slimb_t y1; + + bf_get_limb(&y1, y, 0); + if (y1 < 0) + y1 = -y1; + /* XXX: proof for the added precision */ + prec1 = prec + ceil_log2(y1) * 2 + 8; + ret = bf_pow_ui(r, x, y1 < 0 ? -y1 : y1, prec1, BF_RNDN | BF_FLAG_EXT_EXP); + if (y->sign) { + bf_init(s, T); + bf_set_ui(T, 1); + ret |= bf_div(r, T, r, prec1, BF_RNDN | BF_FLAG_EXT_EXP); + bf_delete(T); + } + return ret; +} + +/* x must be a finite non zero float. Return TRUE if there is a + floating point number r such as x=r^(2^n) and return this floating + point number 'r'. Otherwise return FALSE and r is undefined. */ +static BOOL check_exact_power2n(bf_t *r, const bf_t *x, slimb_t n) +{ + bf_context_t *s = r->ctx; + bf_t T_s, *T = &T_s; + slimb_t e, i, er; + limb_t v; + + /* x = m*2^e with m odd integer */ + e = bf_get_exp_min(x); + /* fast check on the exponent */ + if (n > (LIMB_BITS - 1)) { + if (e != 0) + return FALSE; + er = 0; + } else { + if ((e & (((limb_t)1 << n) - 1)) != 0) + return FALSE; + er = e >> n; + } + /* every perfect odd square = 1 modulo 8 */ + v = get_bits(x->tab, x->len, x->len * LIMB_BITS - x->expn + e); + if ((v & 7) != 1) + return FALSE; + + bf_init(s, T); + bf_set(T, x); + T->expn -= e; + for(i = 0; i < n; i++) { + if (i != 0) + bf_set(T, r); + if (bf_sqrtrem(r, NULL, T) != 0) + return FALSE; + } + r->expn += er; + return TRUE; +} + +/* prec = BF_PREC_INF is accepted for x and y integers and y >= 0 */ +int bf_pow(bf_t *r, const bf_t *x, const bf_t *y, limb_t prec, bf_flags_t flags) +{ + bf_context_t *s = r->ctx; + bf_t T_s, *T = &T_s; + bf_t ytmp_s; + BOOL y_is_int, y_is_odd; + int r_sign, ret, rnd_mode; + slimb_t y_emin; + + if (x->len == 0 || y->len == 0) { + if (y->expn == BF_EXP_ZERO) { + /* pow(x, 0) = 1 */ + bf_set_ui(r, 1); + } else if (x->expn == BF_EXP_NAN) { + bf_set_nan(r); + } else { + int cmp_x_abs_1; + bf_set_ui(r, 1); + cmp_x_abs_1 = bf_cmpu(x, r); + if (cmp_x_abs_1 == 0 && (flags & BF_POW_JS_QUIRKS) && + (y->expn >= BF_EXP_INF)) { + bf_set_nan(r); + } else if (cmp_x_abs_1 == 0 && + (!x->sign || y->expn != BF_EXP_NAN)) { + /* pow(1, y) = 1 even if y = NaN */ + /* pow(-1, +/-inf) = 1 */ + } else if (y->expn == BF_EXP_NAN) { + bf_set_nan(r); + } else if (y->expn == BF_EXP_INF) { + if (y->sign == (cmp_x_abs_1 > 0)) { + bf_set_zero(r, 0); + } else { + bf_set_inf(r, 0); + } + } else { + y_emin = bf_get_exp_min(y); + y_is_odd = (y_emin == 0); + if (y->sign == (x->expn == BF_EXP_ZERO)) { + bf_set_inf(r, y_is_odd & x->sign); + if (y->sign) { + /* pow(0, y) with y < 0 */ + return BF_ST_DIVIDE_ZERO; + } + } else { + bf_set_zero(r, y_is_odd & x->sign); + } + } + } + return 0; + } + bf_init(s, T); + bf_set(T, x); + y_emin = bf_get_exp_min(y); + y_is_int = (y_emin >= 0); + rnd_mode = flags & BF_RND_MASK; + if (x->sign) { + if (!y_is_int) { + bf_set_nan(r); + bf_delete(T); + return BF_ST_INVALID_OP; + } + y_is_odd = (y_emin == 0); + r_sign = y_is_odd; + /* change the directed rounding mode if the sign of the result + is changed */ + if (r_sign && (rnd_mode == BF_RNDD || rnd_mode == BF_RNDU)) + flags ^= 1; + bf_neg(T); + } else { + r_sign = 0; + } + + bf_set_ui(r, 1); + if (bf_cmp_eq(T, r)) { + /* abs(x) = 1: nothing more to do */ + ret = 0; + } else { + /* check the overflow/underflow cases */ + { + bf_t al_s, *al = &al_s; + bf_t ah_s, *ah = &ah_s; + limb_t precl = LIMB_BITS; + + bf_init(s, al); + bf_init(s, ah); + /* compute bounds of log(abs(x)) * y with a low precision */ + /* XXX: compute bf_log() once */ + /* XXX: add a fast test before this slow test */ + bf_log(al, T, precl, BF_RNDD); + bf_log(ah, T, precl, BF_RNDU); + bf_mul(al, al, y, precl, BF_RNDD ^ y->sign); + bf_mul(ah, ah, y, precl, BF_RNDU ^ y->sign); + ret = check_exp_underflow_overflow(s, r, al, ah, prec, flags); + bf_delete(al); + bf_delete(ah); + if (ret) + goto done; + } + + if (y_is_int) { + slimb_t T_bits, e; + int_pow: + T_bits = T->expn - bf_get_exp_min(T); + if (T_bits == 1) { + /* pow(2^b, y) = 2^(b*y) */ + bf_mul_si(T, y, T->expn - 1, LIMB_BITS, BF_RNDZ); + bf_get_limb(&e, T, 0); + bf_set_ui(r, 1); + ret = bf_mul_2exp(r, e, prec, flags); + } else if (prec == BF_PREC_INF) { + slimb_t y1; + /* specific case for infinite precision (integer case) */ + bf_get_limb(&y1, y, 0); + assert(!y->sign); + /* x must be an integer, so abs(x) >= 2 */ + if (y1 >= ((slimb_t)1 << BF_EXP_BITS_MAX)) { + bf_delete(T); + return bf_set_overflow(r, 0, BF_PREC_INF, flags); + } + ret = bf_pow_ui(r, T, y1, BF_PREC_INF, BF_RNDZ); + } else { + if (y->expn <= 31) { + /* small enough power: use exponentiation in all cases */ + } else if (y->sign) { + /* cannot be exact */ + goto general_case; + } else { + if (rnd_mode == BF_RNDF) + goto general_case; /* no need to track exact results */ + /* see if the result has a chance to be exact: + if x=a*2^b (a odd), x^y=a^y*2^(b*y) + x^y needs a precision of at least floor_log2(a)*y bits + */ + bf_mul_si(r, y, T_bits - 1, LIMB_BITS, BF_RNDZ); + bf_get_limb(&e, r, 0); + if (prec < e) + goto general_case; + } + ret = bf_ziv_rounding(r, T, prec, flags, bf_pow_int, (void *)y); + } + } else { + if (rnd_mode != BF_RNDF) { + bf_t *y1; + if (y_emin < 0 && check_exact_power2n(r, T, -y_emin)) { + /* the problem is reduced to a power to an integer */ +#if 0 + printf("\nn=%" PRId64 "\n", -(int64_t)y_emin); + bf_print_str("T", T); + bf_print_str("r", r); +#endif + bf_set(T, r); + y1 = &ytmp_s; + y1->tab = y->tab; + y1->len = y->len; + y1->sign = y->sign; + y1->expn = y->expn - y_emin; + y = y1; + goto int_pow; + } + } + general_case: + ret = bf_ziv_rounding(r, T, prec, flags, bf_pow_generic, (void *)y); + } + } + done: + bf_delete(T); + r->sign = r_sign; + return ret; +} + +/* compute sqrt(-2*x-x^2) to get |sin(x)| from cos(x) - 1. */ +static void bf_sqrt_sin(bf_t *r, const bf_t *x, limb_t prec1) +{ + bf_context_t *s = r->ctx; + bf_t T_s, *T = &T_s; + bf_init(s, T); + bf_set(T, x); + bf_mul(r, T, T, prec1, BF_RNDN); + bf_mul_2exp(T, 1, BF_PREC_INF, BF_RNDZ); + bf_add(T, T, r, prec1, BF_RNDN); + bf_neg(T); + bf_sqrt(r, T, prec1, BF_RNDF); + bf_delete(T); +} + +static int bf_sincos(bf_t *s, bf_t *c, const bf_t *a, limb_t prec) +{ + bf_context_t *s1 = a->ctx; + bf_t T_s, *T = &T_s; + bf_t U_s, *U = &U_s; + bf_t r_s, *r = &r_s; + slimb_t K, prec1, i, l, mod, prec2; + int is_neg; + + assert(c != a && s != a); + + bf_init(s1, T); + bf_init(s1, U); + bf_init(s1, r); + + /* XXX: precision analysis */ + K = bf_isqrt(prec / 2); + l = prec / (2 * K) + 1; + prec1 = prec + 2 * K + l + 8; + + /* after the modulo reduction, -pi/4 <= T <= pi/4 */ + if (a->expn <= -1) { + /* abs(a) <= 0.25: no modulo reduction needed */ + bf_set(T, a); + mod = 0; + } else { + slimb_t cancel; + cancel = 0; + for(;;) { + prec2 = prec1 + a->expn + cancel; + bf_const_pi(U, prec2, BF_RNDF); + bf_mul_2exp(U, -1, BF_PREC_INF, BF_RNDZ); + bf_remquo(&mod, T, a, U, prec2, BF_RNDN, BF_RNDN); + // printf("T.expn=%ld prec2=%ld\n", T->expn, prec2); + if (mod == 0 || (T->expn != BF_EXP_ZERO && + (T->expn + prec2) >= (prec1 - 1))) + break; + /* increase the number of bits until the precision is good enough */ + cancel = bf_max(-T->expn, (cancel + 1) * 3 / 2); + } + mod &= 3; + } + + is_neg = T->sign; + + /* compute cosm1(x) = cos(x) - 1 */ + bf_mul(T, T, T, prec1, BF_RNDN); + bf_mul_2exp(T, -2 * K, BF_PREC_INF, BF_RNDZ); + + /* Taylor expansion: + -x^2/2 + x^4/4! - x^6/6! + ... + */ + bf_set_ui(r, 1); + for(i = l ; i >= 1; i--) { + bf_set_ui(U, 2 * i - 1); + bf_mul_ui(U, U, 2 * i, BF_PREC_INF, BF_RNDZ); + bf_div(U, T, U, prec1, BF_RNDN); + bf_mul(r, r, U, prec1, BF_RNDN); + bf_neg(r); + if (i != 1) + bf_add_si(r, r, 1, prec1, BF_RNDN); + } + bf_delete(U); + + /* undo argument reduction: + cosm1(2*x)= 2*(2*cosm1(x)+cosm1(x)^2) + */ + for(i = 0; i < K; i++) { + bf_mul(T, r, r, prec1, BF_RNDN); + bf_mul_2exp(r, 1, BF_PREC_INF, BF_RNDZ); + bf_add(r, r, T, prec1, BF_RNDN); + bf_mul_2exp(r, 1, BF_PREC_INF, BF_RNDZ); + } + bf_delete(T); + + if (c) { + if ((mod & 1) == 0) { + bf_add_si(c, r, 1, prec1, BF_RNDN); + } else { + bf_sqrt_sin(c, r, prec1); + c->sign = is_neg ^ 1; + } + c->sign ^= mod >> 1; + } + if (s) { + if ((mod & 1) == 0) { + bf_sqrt_sin(s, r, prec1); + s->sign = is_neg; + } else { + bf_add_si(s, r, 1, prec1, BF_RNDN); + } + s->sign ^= mod >> 1; + } + bf_delete(r); + return BF_ST_INEXACT; +} + +static int bf_cos_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque) +{ + return bf_sincos(NULL, r, a, prec); +} + +int bf_cos(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) +{ + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bf_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } else { + bf_set_ui(r, 1); + return 0; + } + } + + /* small argument case: result = 1+r(x) with r(x) = -x^2/2 + + O(X^4). We assume r(x) < 2^(2*EXP(x) - 1). */ + if (a->expn < 0) { + slimb_t e; + e = 2 * a->expn - 1; + if (e < -(prec + 2)) { + bf_set_ui(r, 1); + return bf_add_epsilon(r, r, e, 1, prec, flags); + } + } + + return bf_ziv_rounding(r, a, prec, flags, bf_cos_internal, NULL); +} + +static int bf_sin_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque) +{ + return bf_sincos(r, NULL, a, prec); +} + +int bf_sin(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) +{ + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bf_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } else { + bf_set_zero(r, a->sign); + return 0; + } + } + + /* small argument case: result = x+r(x) with r(x) = -x^3/6 + + O(X^5). We assume r(x) < 2^(3*EXP(x) - 2). */ + if (a->expn < 0) { + slimb_t e; + e = sat_add(2 * a->expn, a->expn - 2); + if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) { + bf_set(r, a); + return bf_add_epsilon(r, r, e, 1 - a->sign, prec, flags); + } + } + + return bf_ziv_rounding(r, a, prec, flags, bf_sin_internal, NULL); +} + +static int bf_tan_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque) +{ + bf_context_t *s = r->ctx; + bf_t T_s, *T = &T_s; + limb_t prec1; + + /* XXX: precision analysis */ + prec1 = prec + 8; + bf_init(s, T); + bf_sincos(r, T, a, prec1); + bf_div(r, r, T, prec1, BF_RNDF); + bf_delete(T); + return BF_ST_INEXACT; +} + +int bf_tan(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) +{ + assert(r != a); + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bf_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } else { + bf_set_zero(r, a->sign); + return 0; + } + } + + /* small argument case: result = x+r(x) with r(x) = x^3/3 + + O(X^5). We assume r(x) < 2^(3*EXP(x) - 1). */ + if (a->expn < 0) { + slimb_t e; + e = sat_add(2 * a->expn, a->expn - 1); + if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) { + bf_set(r, a); + return bf_add_epsilon(r, r, e, a->sign, prec, flags); + } + } + + return bf_ziv_rounding(r, a, prec, flags, bf_tan_internal, NULL); +} + +/* if add_pi2 is true, add pi/2 to the result (used for acos(x) to + avoid cancellation) */ +static int bf_atan_internal(bf_t *r, const bf_t *a, limb_t prec, + void *opaque) +{ + bf_context_t *s = r->ctx; + BOOL add_pi2 = (BOOL)(intptr_t)opaque; + bf_t T_s, *T = &T_s; + bf_t U_s, *U = &U_s; + bf_t V_s, *V = &V_s; + bf_t X2_s, *X2 = &X2_s; + int cmp_1; + slimb_t prec1, i, K, l; + + /* XXX: precision analysis */ + K = bf_isqrt((prec + 1) / 2); + l = prec / (2 * K) + 1; + prec1 = prec + K + 2 * l + 32; + // printf("prec=%d K=%d l=%d prec1=%d\n", (int)prec, (int)K, (int)l, (int)prec1); + + bf_init(s, T); + cmp_1 = (a->expn >= 1); /* a >= 1 */ + if (cmp_1) { + bf_set_ui(T, 1); + bf_div(T, T, a, prec1, BF_RNDN); + } else { + bf_set(T, a); + } + + /* abs(T) <= 1 */ + + /* argument reduction */ + + bf_init(s, U); + bf_init(s, V); + bf_init(s, X2); + for(i = 0; i < K; i++) { + /* T = T / (1 + sqrt(1 + T^2)) */ + bf_mul(U, T, T, prec1, BF_RNDN); + bf_add_si(U, U, 1, prec1, BF_RNDN); + bf_sqrt(V, U, prec1, BF_RNDN); + bf_add_si(V, V, 1, prec1, BF_RNDN); + bf_div(T, T, V, prec1, BF_RNDN); + } + + /* Taylor series: + x - x^3/3 + ... + (-1)^ l * y^(2*l + 1) / (2*l+1) + */ + bf_mul(X2, T, T, prec1, BF_RNDN); + bf_set_ui(r, 0); + for(i = l; i >= 1; i--) { + bf_set_si(U, 1); + bf_set_ui(V, 2 * i + 1); + bf_div(U, U, V, prec1, BF_RNDN); + bf_neg(r); + bf_add(r, r, U, prec1, BF_RNDN); + bf_mul(r, r, X2, prec1, BF_RNDN); + } + bf_neg(r); + bf_add_si(r, r, 1, prec1, BF_RNDN); + bf_mul(r, r, T, prec1, BF_RNDN); + + /* undo the argument reduction */ + bf_mul_2exp(r, K, BF_PREC_INF, BF_RNDZ); + + bf_delete(U); + bf_delete(V); + bf_delete(X2); + + i = add_pi2; + if (cmp_1 > 0) { + /* undo the inversion : r = sign(a)*PI/2 - r */ + bf_neg(r); + i += 1 - 2 * a->sign; + } + /* add i*(pi/2) with -1 <= i <= 2 */ + if (i != 0) { + bf_const_pi(T, prec1, BF_RNDF); + if (i != 2) + bf_mul_2exp(T, -1, BF_PREC_INF, BF_RNDZ); + T->sign = (i < 0); + bf_add(r, T, r, prec1, BF_RNDN); + } + + bf_delete(T); + return BF_ST_INEXACT; +} + +int bf_atan(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) +{ + bf_context_t *s = r->ctx; + bf_t T_s, *T = &T_s; + int res; + + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bf_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF) { + /* -PI/2 or PI/2 */ + bf_const_pi_signed(r, a->sign, prec, flags); + bf_mul_2exp(r, -1, BF_PREC_INF, BF_RNDZ); + return BF_ST_INEXACT; + } else { + bf_set_zero(r, a->sign); + return 0; + } + } + + bf_init(s, T); + bf_set_ui(T, 1); + res = bf_cmpu(a, T); + bf_delete(T); + if (res == 0) { + /* short cut: abs(a) == 1 -> +/-pi/4 */ + bf_const_pi_signed(r, a->sign, prec, flags); + bf_mul_2exp(r, -2, BF_PREC_INF, BF_RNDZ); + return BF_ST_INEXACT; + } + + /* small argument case: result = x+r(x) with r(x) = -x^3/3 + + O(X^5). We assume r(x) < 2^(3*EXP(x) - 1). */ + if (a->expn < 0) { + slimb_t e; + e = sat_add(2 * a->expn, a->expn - 1); + if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) { + bf_set(r, a); + return bf_add_epsilon(r, r, e, 1 - a->sign, prec, flags); + } + } + + return bf_ziv_rounding(r, a, prec, flags, bf_atan_internal, (void *)FALSE); +} + +static int bf_atan2_internal(bf_t *r, const bf_t *y, limb_t prec, void *opaque) +{ + bf_context_t *s = r->ctx; + const bf_t *x = opaque; + bf_t T_s, *T = &T_s; + limb_t prec1; + int ret; + + if (y->expn == BF_EXP_NAN || x->expn == BF_EXP_NAN) { + bf_set_nan(r); + return 0; + } + + /* compute atan(y/x) assumming inf/inf = 1 and 0/0 = 0 */ + bf_init(s, T); + prec1 = prec + 32; + if (y->expn == BF_EXP_INF && x->expn == BF_EXP_INF) { + bf_set_ui(T, 1); + T->sign = y->sign ^ x->sign; + } else if (y->expn == BF_EXP_ZERO && x->expn == BF_EXP_ZERO) { + bf_set_zero(T, y->sign ^ x->sign); + } else { + bf_div(T, y, x, prec1, BF_RNDF); + } + ret = bf_atan(r, T, prec1, BF_RNDF); + + if (x->sign) { + /* if x < 0 (it includes -0), return sign(y)*pi + atan(y/x) */ + bf_const_pi(T, prec1, BF_RNDF); + T->sign = y->sign; + bf_add(r, r, T, prec1, BF_RNDN); + ret |= BF_ST_INEXACT; + } + + bf_delete(T); + return ret; +} + +int bf_atan2(bf_t *r, const bf_t *y, const bf_t *x, + limb_t prec, bf_flags_t flags) +{ + return bf_ziv_rounding(r, y, prec, flags, bf_atan2_internal, (void *)x); +} + +static int bf_asin_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque) +{ + bf_context_t *s = r->ctx; + BOOL is_acos = (BOOL)(intptr_t)opaque; + bf_t T_s, *T = &T_s; + limb_t prec1, prec2; + + /* asin(x) = atan(x/sqrt(1-x^2)) + acos(x) = pi/2 - asin(x) */ + prec1 = prec + 8; + /* increase the precision in x^2 to compensate the cancellation in + (1-x^2) if x is close to 1 */ + /* XXX: use less precision when possible */ + if (a->expn >= 0) + prec2 = BF_PREC_INF; + else + prec2 = prec1; + bf_init(s, T); + bf_mul(T, a, a, prec2, BF_RNDN); + bf_neg(T); + bf_add_si(T, T, 1, prec2, BF_RNDN); + + bf_sqrt(r, T, prec1, BF_RNDN); + bf_div(T, a, r, prec1, BF_RNDN); + if (is_acos) + bf_neg(T); + bf_atan_internal(r, T, prec1, (void *)(intptr_t)is_acos); + bf_delete(T); + return BF_ST_INEXACT; +} + +int bf_asin(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) +{ + bf_context_t *s = r->ctx; + bf_t T_s, *T = &T_s; + int res; + + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bf_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } else { + bf_set_zero(r, a->sign); + return 0; + } + } + bf_init(s, T); + bf_set_ui(T, 1); + res = bf_cmpu(a, T); + bf_delete(T); + if (res > 0) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } + + /* small argument case: result = x+r(x) with r(x) = x^3/6 + + O(X^5). We assume r(x) < 2^(3*EXP(x) - 2). */ + if (a->expn < 0) { + slimb_t e; + e = sat_add(2 * a->expn, a->expn - 2); + if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) { + bf_set(r, a); + return bf_add_epsilon(r, r, e, a->sign, prec, flags); + } + } + + return bf_ziv_rounding(r, a, prec, flags, bf_asin_internal, (void *)FALSE); +} + +int bf_acos(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) +{ + bf_context_t *s = r->ctx; + bf_t T_s, *T = &T_s; + int res; + + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bf_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } else { + bf_const_pi(r, prec, flags); + bf_mul_2exp(r, -1, BF_PREC_INF, BF_RNDZ); + return BF_ST_INEXACT; + } + } + bf_init(s, T); + bf_set_ui(T, 1); + res = bf_cmpu(a, T); + bf_delete(T); + if (res > 0) { + bf_set_nan(r); + return BF_ST_INVALID_OP; + } else if (res == 0 && a->sign == 0) { + bf_set_zero(r, 0); + return 0; + } + + return bf_ziv_rounding(r, a, prec, flags, bf_asin_internal, (void *)TRUE); +} + +/***************************************************************/ +/* decimal floating point numbers */ + +#ifdef USE_BF_DEC + +#define adddq(r1, r0, a1, a0) \ + do { \ + limb_t __t = r0; \ + r0 += (a0); \ + r1 += (a1) + (r0 < __t); \ + } while (0) + +#define subdq(r1, r0, a1, a0) \ + do { \ + limb_t __t = r0; \ + r0 -= (a0); \ + r1 -= (a1) + (r0 > __t); \ + } while (0) + +#if LIMB_BITS == 64 + +/* Note: we assume __int128 is available */ +#define muldq(r1, r0, a, b) \ + do { \ + unsigned __int128 __t; \ + __t = (unsigned __int128)(a) * (unsigned __int128)(b); \ + r0 = __t; \ + r1 = __t >> 64; \ + } while (0) + +#define divdq(q, r, a1, a0, b) \ + do { \ + unsigned __int128 __t; \ + limb_t __b = (b); \ + __t = ((unsigned __int128)(a1) << 64) | (a0); \ + q = __t / __b; \ + r = __t % __b; \ + } while (0) + +#else + +#define muldq(r1, r0, a, b) \ + do { \ + uint64_t __t; \ + __t = (uint64_t)(a) * (uint64_t)(b); \ + r0 = __t; \ + r1 = __t >> 32; \ + } while (0) + +#define divdq(q, r, a1, a0, b) \ + do { \ + uint64_t __t; \ + limb_t __b = (b); \ + __t = ((uint64_t)(a1) << 32) | (a0); \ + q = __t / __b; \ + r = __t % __b; \ + } while (0) + +#endif /* LIMB_BITS != 64 */ + +static inline __maybe_unused limb_t shrd(limb_t low, limb_t high, long shift) +{ + if (shift != 0) + low = (low >> shift) | (high << (LIMB_BITS - shift)); + return low; +} + +static inline __maybe_unused limb_t shld(limb_t a1, limb_t a0, long shift) +{ + if (shift != 0) + return (a1 << shift) | (a0 >> (LIMB_BITS - shift)); + else + return a1; +} + +#if LIMB_DIGITS == 19 + +/* WARNING: hardcoded for b = 1e19. It is assumed that: + 0 <= a1 < 2^63 */ +#define divdq_base(q, r, a1, a0)\ +do {\ + uint64_t __a0, __a1, __t0, __t1, __b = BF_DEC_BASE; \ + __a0 = a0;\ + __a1 = a1;\ + __t0 = __a1;\ + __t0 = shld(__t0, __a0, 1);\ + muldq(q, __t1, __t0, UINT64_C(17014118346046923173)); \ + muldq(__t1, __t0, q, __b);\ + subdq(__a1, __a0, __t1, __t0);\ + subdq(__a1, __a0, 1, __b * 2); \ + __t0 = (slimb_t)__a1 >> 1; \ + q += 2 + __t0;\ + adddq(__a1, __a0, 0, __b & __t0);\ + q += __a1; \ + __a0 += __b & __a1; \ + r = __a0;\ +} while(0) + +#elif LIMB_DIGITS == 9 + +/* WARNING: hardcoded for b = 1e9. It is assumed that: + 0 <= a1 < 2^29 */ +#define divdq_base(q, r, a1, a0)\ +do {\ + uint32_t __t0, __t1, __b = BF_DEC_BASE; \ + __t0 = a1;\ + __t1 = a0;\ + __t0 = (__t0 << 3) | (__t1 >> (32 - 3)); \ + muldq(q, __t1, __t0, 2305843009U);\ + r = a0 - q * __b;\ + __t1 = (r >= __b);\ + q += __t1;\ + if (__t1)\ + r -= __b;\ +} while(0) + +#endif + +/* fast integer division by a fixed constant */ + +typedef struct FastDivData { + limb_t m1; /* multiplier */ + int8_t shift1; + int8_t shift2; +} FastDivData; + +/* From "Division by Invariant Integers using Multiplication" by + Torborn Granlund and Peter L. Montgomery */ +/* d must be != 0 */ +static inline __maybe_unused void fast_udiv_init(FastDivData *s, limb_t d) +{ + int l; + limb_t q, r, m1; + if (d == 1) + l = 0; + else + l = 64 - clz64(d - 1); + divdq(q, r, ((limb_t)1 << l) - d, 0, d); + (void)r; + m1 = q + 1; + // printf("d=%lu l=%d m1=0x%016lx\n", d, l, m1); + s->m1 = m1; + s->shift1 = l; + if (s->shift1 > 1) + s->shift1 = 1; + s->shift2 = l - 1; + if (s->shift2 < 0) + s->shift2 = 0; +} + +static inline limb_t fast_udiv(limb_t a, const FastDivData *s) +{ + limb_t t0, t1; + muldq(t1, t0, s->m1, a); + t0 = (a - t1) >> s->shift1; + return (t1 + t0) >> s->shift2; +} + +/* contains 10^i */ +const limb_t mp_pow_dec[LIMB_DIGITS + 1] = { + 1U, + 10U, + 100U, + 1000U, + 10000U, + 100000U, + 1000000U, + 10000000U, + 100000000U, + 1000000000U, +#if LIMB_BITS == 64 + 10000000000U, + 100000000000U, + 1000000000000U, + 10000000000000U, + 100000000000000U, + 1000000000000000U, + 10000000000000000U, + 100000000000000000U, + 1000000000000000000U, + 10000000000000000000U, +#endif +}; + +/* precomputed from fast_udiv_init(10^i) */ +static const FastDivData mp_pow_div[LIMB_DIGITS + 1] = { +#if LIMB_BITS == 32 + { 0x00000001, 0, 0 }, + { 0x9999999a, 1, 3 }, + { 0x47ae147b, 1, 6 }, + { 0x0624dd30, 1, 9 }, + { 0xa36e2eb2, 1, 13 }, + { 0x4f8b588f, 1, 16 }, + { 0x0c6f7a0c, 1, 19 }, + { 0xad7f29ac, 1, 23 }, + { 0x5798ee24, 1, 26 }, + { 0x12e0be83, 1, 29 }, +#else + { 0x0000000000000001, 0, 0 }, + { 0x999999999999999a, 1, 3 }, + { 0x47ae147ae147ae15, 1, 6 }, + { 0x0624dd2f1a9fbe77, 1, 9 }, + { 0xa36e2eb1c432ca58, 1, 13 }, + { 0x4f8b588e368f0847, 1, 16 }, + { 0x0c6f7a0b5ed8d36c, 1, 19 }, + { 0xad7f29abcaf48579, 1, 23 }, + { 0x5798ee2308c39dfa, 1, 26 }, + { 0x12e0be826d694b2f, 1, 29 }, + { 0xb7cdfd9d7bdbab7e, 1, 33 }, + { 0x5fd7fe17964955fe, 1, 36 }, + { 0x19799812dea11198, 1, 39 }, + { 0xc25c268497681c27, 1, 43 }, + { 0x6849b86a12b9b01f, 1, 46 }, + { 0x203af9ee756159b3, 1, 49 }, + { 0xcd2b297d889bc2b7, 1, 53 }, + { 0x70ef54646d496893, 1, 56 }, + { 0x2725dd1d243aba0f, 1, 59 }, + { 0xd83c94fb6d2ac34d, 1, 63 }, +#endif +}; + +/* divide by 10^shift with 0 <= shift <= LIMB_DIGITS */ +static inline limb_t fast_shr_dec(limb_t a, int shift) +{ + return fast_udiv(a, &mp_pow_div[shift]); +} + +/* division and remainder by 10^shift */ +#define fast_shr_rem_dec(q, r, a, shift) q = fast_shr_dec(a, shift), r = a - q * mp_pow_dec[shift] + +limb_t mp_add_dec(limb_t *res, const limb_t *op1, const limb_t *op2, + mp_size_t n, limb_t carry) +{ + limb_t base = BF_DEC_BASE; + mp_size_t i; + limb_t k, a, v; + + k=carry; + for(i=0;i<n;i++) { + /* XXX: reuse the trick in add_mod */ + v = op1[i]; + a = v + op2[i] + k - base; + k = a <= v; + if (!k) + a += base; + res[i]=a; + } + return k; +} + +limb_t mp_add_ui_dec(limb_t *tab, limb_t b, mp_size_t n) +{ + limb_t base = BF_DEC_BASE; + mp_size_t i; + limb_t k, a, v; + + k=b; + for(i=0;i<n;i++) { + v = tab[i]; + a = v + k - base; + k = a <= v; + if (!k) + a += base; + tab[i] = a; + if (k == 0) + break; + } + return k; +} + +limb_t mp_sub_dec(limb_t *res, const limb_t *op1, const limb_t *op2, + mp_size_t n, limb_t carry) +{ + limb_t base = BF_DEC_BASE; + mp_size_t i; + limb_t k, v, a; + + k=carry; + for(i=0;i<n;i++) { + v = op1[i]; + a = v - op2[i] - k; + k = a > v; + if (k) + a += base; + res[i] = a; + } + return k; +} + +limb_t mp_sub_ui_dec(limb_t *tab, limb_t b, mp_size_t n) +{ + limb_t base = BF_DEC_BASE; + mp_size_t i; + limb_t k, v, a; + + k=b; + for(i=0;i<n;i++) { + v = tab[i]; + a = v - k; + k = a > v; + if (k) + a += base; + tab[i]=a; + if (k == 0) + break; + } + return k; +} + +/* taba[] = taba[] * b + l. 0 <= b, l <= base - 1. Return the high carry */ +limb_t mp_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n, + limb_t b, limb_t l) +{ + mp_size_t i; + limb_t t0, t1, r; + + for(i = 0; i < n; i++) { + muldq(t1, t0, taba[i], b); + adddq(t1, t0, 0, l); + divdq_base(l, r, t1, t0); + tabr[i] = r; + } + return l; +} + +/* tabr[] += taba[] * b. 0 <= b <= base - 1. Return the value to add + to the high word */ +limb_t mp_add_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n, + limb_t b) +{ + mp_size_t i; + limb_t l, t0, t1, r; + + l = 0; + for(i = 0; i < n; i++) { + muldq(t1, t0, taba[i], b); + adddq(t1, t0, 0, l); + adddq(t1, t0, 0, tabr[i]); + divdq_base(l, r, t1, t0); + tabr[i] = r; + } + return l; +} + +/* tabr[] -= taba[] * b. 0 <= b <= base - 1. Return the value to + substract to the high word. */ +limb_t mp_sub_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n, + limb_t b) +{ + limb_t base = BF_DEC_BASE; + mp_size_t i; + limb_t l, t0, t1, r, a, v, c; + + /* XXX: optimize */ + l = 0; + for(i = 0; i < n; i++) { + muldq(t1, t0, taba[i], b); + adddq(t1, t0, 0, l); + divdq_base(l, r, t1, t0); + v = tabr[i]; + a = v - r; + c = a > v; + if (c) + a += base; + /* never bigger than base because r = 0 when l = base - 1 */ + l += c; + tabr[i] = a; + } + return l; +} + +/* size of the result : op1_size + op2_size. */ +void mp_mul_basecase_dec(limb_t *result, + const limb_t *op1, mp_size_t op1_size, + const limb_t *op2, mp_size_t op2_size) +{ + mp_size_t i; + limb_t r; + + result[op1_size] = mp_mul1_dec(result, op1, op1_size, op2[0], 0); + + for(i=1;i<op2_size;i++) { + r = mp_add_mul1_dec(result + i, op1, op1_size, op2[i]); + result[i + op1_size] = r; + } +} + +/* taba[] = (taba[] + r*base^na) / b. 0 <= b < base. 0 <= r < + b. Return the remainder. */ +limb_t mp_div1_dec(limb_t *tabr, const limb_t *taba, mp_size_t na, + limb_t b, limb_t r) +{ + limb_t base = BF_DEC_BASE; + mp_size_t i; + limb_t t0, t1, q; + int shift; + +#if (BF_DEC_BASE % 2) == 0 + if (b == 2) { + limb_t base_div2; + /* Note: only works if base is even */ + base_div2 = base >> 1; + if (r) + r = base_div2; + for(i = na - 1; i >= 0; i--) { + t0 = taba[i]; + tabr[i] = (t0 >> 1) + r; + r = 0; + if (t0 & 1) + r = base_div2; + } + if (r) + r = 1; + } else +#endif + if (na >= UDIV1NORM_THRESHOLD) { + shift = clz(b); + if (shift == 0) { + /* normalized case: b >= 2^(LIMB_BITS-1) */ + limb_t b_inv; + b_inv = udiv1norm_init(b); + for(i = na - 1; i >= 0; i--) { + muldq(t1, t0, r, base); + adddq(t1, t0, 0, taba[i]); + q = udiv1norm(&r, t1, t0, b, b_inv); + tabr[i] = q; + } + } else { + limb_t b_inv; + b <<= shift; + b_inv = udiv1norm_init(b); + for(i = na - 1; i >= 0; i--) { + muldq(t1, t0, r, base); + adddq(t1, t0, 0, taba[i]); + t1 = (t1 << shift) | (t0 >> (LIMB_BITS - shift)); + t0 <<= shift; + q = udiv1norm(&r, t1, t0, b, b_inv); + r >>= shift; + tabr[i] = q; + } + } + } else { + for(i = na - 1; i >= 0; i--) { + muldq(t1, t0, r, base); + adddq(t1, t0, 0, taba[i]); + divdq(q, r, t1, t0, b); + tabr[i] = q; + } + } + return r; +} + +static __maybe_unused void mp_print_str_dec(const char *str, + const limb_t *tab, slimb_t n) +{ + slimb_t i; + printf("%s=", str); + for(i = n - 1; i >= 0; i--) { + if (i != n - 1) + printf("_"); + printf("%0*" PRIu_LIMB, LIMB_DIGITS, tab[i]); + } + printf("\n"); +} + +static __maybe_unused void mp_print_str_h_dec(const char *str, + const limb_t *tab, slimb_t n, + limb_t high) +{ + slimb_t i; + printf("%s=", str); + printf("%0*" PRIu_LIMB, LIMB_DIGITS, high); + for(i = n - 1; i >= 0; i--) { + printf("_"); + printf("%0*" PRIu_LIMB, LIMB_DIGITS, tab[i]); + } + printf("\n"); +} + +//#define DEBUG_DIV_SLOW + +#define DIV_STATIC_ALLOC_LEN 16 + +/* return q = a / b and r = a % b. + + taba[na] must be allocated if tabb1[nb - 1] < B / 2. tabb1[nb - 1] + must be != zero. na must be >= nb. 's' can be NULL if tabb1[nb - 1] + >= B / 2. + + The remainder is is returned in taba and contains nb libms. tabq + contains na - nb + 1 limbs. No overlap is permitted. + + Running time of the standard method: (na - nb + 1) * nb + Return 0 if OK, -1 if memory alloc error +*/ +/* XXX: optimize */ +static int mp_div_dec(bf_context_t *s, limb_t *tabq, + limb_t *taba, mp_size_t na, + const limb_t *tabb1, mp_size_t nb) +{ + limb_t base = BF_DEC_BASE; + limb_t r, mult, t0, t1, a, c, q, v, *tabb; + mp_size_t i, j; + limb_t static_tabb[DIV_STATIC_ALLOC_LEN]; + +#ifdef DEBUG_DIV_SLOW + mp_print_str_dec("a", taba, na); + mp_print_str_dec("b", tabb1, nb); +#endif + + /* normalize tabb */ + r = tabb1[nb - 1]; + assert(r != 0); + i = na - nb; + if (r >= BF_DEC_BASE / 2) { + mult = 1; + tabb = (limb_t *)tabb1; + q = 1; + for(j = nb - 1; j >= 0; j--) { + if (taba[i + j] != tabb[j]) { + if (taba[i + j] < tabb[j]) + q = 0; + break; + } + } + tabq[i] = q; + if (q) { + mp_sub_dec(taba + i, taba + i, tabb, nb, 0); + } + i--; + } else { + mult = base / (r + 1); + if (likely(nb <= DIV_STATIC_ALLOC_LEN)) { + tabb = static_tabb; + } else { + tabb = bf_malloc(s, sizeof(limb_t) * nb); + if (!tabb) + return -1; + } + mp_mul1_dec(tabb, tabb1, nb, mult, 0); + taba[na] = mp_mul1_dec(taba, taba, na, mult, 0); + } + +#ifdef DEBUG_DIV_SLOW + printf("mult=" FMT_LIMB "\n", mult); + mp_print_str_dec("a_norm", taba, na + 1); + mp_print_str_dec("b_norm", tabb, nb); +#endif + + for(; i >= 0; i--) { + if (unlikely(taba[i + nb] >= tabb[nb - 1])) { + /* XXX: check if it is really possible */ + q = base - 1; + } else { + muldq(t1, t0, taba[i + nb], base); + adddq(t1, t0, 0, taba[i + nb - 1]); + divdq(q, r, t1, t0, tabb[nb - 1]); + } + // printf("i=%d q1=%ld\n", i, q); + + r = mp_sub_mul1_dec(taba + i, tabb, nb, q); + // mp_dump("r1", taba + i, nb, bd); + // printf("r2=%ld\n", r); + + v = taba[i + nb]; + a = v - r; + c = a > v; + if (c) + a += base; + taba[i + nb] = a; + + if (c != 0) { + /* negative result */ + for(;;) { + q--; + c = mp_add_dec(taba + i, taba + i, tabb, nb, 0); + /* propagate carry and test if positive result */ + if (c != 0) { + if (++taba[i + nb] == base) { + break; + } + } + } + } + tabq[i] = q; + } + +#ifdef DEBUG_DIV_SLOW + mp_print_str_dec("q", tabq, na - nb + 1); + mp_print_str_dec("r", taba, nb); +#endif + + /* remove the normalization */ + if (mult != 1) { + mp_div1_dec(taba, taba, nb, mult, 0); + if (unlikely(tabb != static_tabb)) + bf_free(s, tabb); + } + return 0; +} + +/* divide by 10^shift */ +static limb_t mp_shr_dec(limb_t *tab_r, const limb_t *tab, mp_size_t n, + limb_t shift, limb_t high) +{ + mp_size_t i; + limb_t l, a, q, r; + + assert(shift >= 1 && shift < LIMB_DIGITS); + l = high; + for(i = n - 1; i >= 0; i--) { + a = tab[i]; + fast_shr_rem_dec(q, r, a, shift); + tab_r[i] = q + l * mp_pow_dec[LIMB_DIGITS - shift]; + l = r; + } + return l; +} + +/* multiply by 10^shift */ +static limb_t mp_shl_dec(limb_t *tab_r, const limb_t *tab, mp_size_t n, + limb_t shift, limb_t low) +{ + mp_size_t i; + limb_t l, a, q, r; + + assert(shift >= 1 && shift < LIMB_DIGITS); + l = low; + for(i = 0; i < n; i++) { + a = tab[i]; + fast_shr_rem_dec(q, r, a, LIMB_DIGITS - shift); + tab_r[i] = r * mp_pow_dec[shift] + l; + l = q; + } + return l; +} + +static limb_t mp_sqrtrem2_dec(limb_t *tabs, limb_t *taba) +{ + int k; + dlimb_t a, b, r; + limb_t taba1[2], s, r0, r1; + + /* convert to binary and normalize */ + a = (dlimb_t)taba[1] * BF_DEC_BASE + taba[0]; + k = clz(a >> LIMB_BITS) & ~1; + b = a << k; + taba1[0] = b; + taba1[1] = b >> LIMB_BITS; + mp_sqrtrem2(&s, taba1); + s >>= (k >> 1); + /* convert the remainder back to decimal */ + r = a - (dlimb_t)s * (dlimb_t)s; + divdq_base(r1, r0, r >> LIMB_BITS, r); + taba[0] = r0; + tabs[0] = s; + return r1; +} + +//#define DEBUG_SQRTREM_DEC + +/* tmp_buf must contain (n / 2 + 1 limbs) */ +static limb_t mp_sqrtrem_rec_dec(limb_t *tabs, limb_t *taba, limb_t n, + limb_t *tmp_buf) +{ + limb_t l, h, rh, ql, qh, c, i; + + if (n == 1) + return mp_sqrtrem2_dec(tabs, taba); +#ifdef DEBUG_SQRTREM_DEC + mp_print_str_dec("a", taba, 2 * n); +#endif + l = n / 2; + h = n - l; + qh = mp_sqrtrem_rec_dec(tabs + l, taba + 2 * l, h, tmp_buf); +#ifdef DEBUG_SQRTREM_DEC + mp_print_str_dec("s1", tabs + l, h); + mp_print_str_h_dec("r1", taba + 2 * l, h, qh); + mp_print_str_h_dec("r2", taba + l, n, qh); +#endif + + /* the remainder is in taba + 2 * l. Its high bit is in qh */ + if (qh) { + mp_sub_dec(taba + 2 * l, taba + 2 * l, tabs + l, h, 0); + } + /* instead of dividing by 2*s, divide by s (which is normalized) + and update q and r */ + mp_div_dec(NULL, tmp_buf, taba + l, n, tabs + l, h); + qh += tmp_buf[l]; + for(i = 0; i < l; i++) + tabs[i] = tmp_buf[i]; + ql = mp_div1_dec(tabs, tabs, l, 2, qh & 1); + qh = qh >> 1; /* 0 or 1 */ + if (ql) + rh = mp_add_dec(taba + l, taba + l, tabs + l, h, 0); + else + rh = 0; +#ifdef DEBUG_SQRTREM_DEC + mp_print_str_h_dec("q", tabs, l, qh); + mp_print_str_h_dec("u", taba + l, h, rh); +#endif + + mp_add_ui_dec(tabs + l, qh, h); +#ifdef DEBUG_SQRTREM_DEC + mp_print_str_dec("s2", tabs, n); +#endif + + /* q = qh, tabs[l - 1 ... 0], r = taba[n - 1 ... l] */ + /* subtract q^2. if qh = 1 then q = B^l, so we can take shortcuts */ + if (qh) { + c = qh; + } else { + mp_mul_basecase_dec(taba + n, tabs, l, tabs, l); + c = mp_sub_dec(taba, taba, taba + n, 2 * l, 0); + } + rh -= mp_sub_ui_dec(taba + 2 * l, c, n - 2 * l); + if ((slimb_t)rh < 0) { + mp_sub_ui_dec(tabs, 1, n); + rh += mp_add_mul1_dec(taba, tabs, n, 2); + rh += mp_add_ui_dec(taba, 1, n); + } + return rh; +} + +/* 'taba' has 2*n limbs with n >= 1 and taba[2*n-1] >= B/4. Return (s, + r) with s=floor(sqrt(a)) and r=a-s^2. 0 <= r <= 2 * s. tabs has n + limbs. r is returned in the lower n limbs of taba. Its r[n] is the + returned value of the function. */ +int mp_sqrtrem_dec(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n) +{ + limb_t tmp_buf1[8]; + limb_t *tmp_buf; + mp_size_t n2; + n2 = n / 2 + 1; + if (n2 <= countof(tmp_buf1)) { + tmp_buf = tmp_buf1; + } else { + tmp_buf = bf_malloc(s, sizeof(limb_t) * n2); + if (!tmp_buf) + return -1; + } + taba[n] = mp_sqrtrem_rec_dec(tabs, taba, n, tmp_buf); + if (tmp_buf != tmp_buf1) + bf_free(s, tmp_buf); + return 0; +} + +/* return the number of leading zero digits, from 0 to LIMB_DIGITS */ +static int clz_dec(limb_t a) +{ + if (a == 0) + return LIMB_DIGITS; + switch(LIMB_BITS - 1 - clz(a)) { + case 0: /* 1-1 */ + return LIMB_DIGITS - 1; + case 1: /* 2-3 */ + return LIMB_DIGITS - 1; + case 2: /* 4-7 */ + return LIMB_DIGITS - 1; + case 3: /* 8-15 */ + if (a < 10) + return LIMB_DIGITS - 1; + else + return LIMB_DIGITS - 2; + case 4: /* 16-31 */ + return LIMB_DIGITS - 2; + case 5: /* 32-63 */ + return LIMB_DIGITS - 2; + case 6: /* 64-127 */ + if (a < 100) + return LIMB_DIGITS - 2; + else + return LIMB_DIGITS - 3; + case 7: /* 128-255 */ + return LIMB_DIGITS - 3; + case 8: /* 256-511 */ + return LIMB_DIGITS - 3; + case 9: /* 512-1023 */ + if (a < 1000) + return LIMB_DIGITS - 3; + else + return LIMB_DIGITS - 4; + case 10: /* 1024-2047 */ + return LIMB_DIGITS - 4; + case 11: /* 2048-4095 */ + return LIMB_DIGITS - 4; + case 12: /* 4096-8191 */ + return LIMB_DIGITS - 4; + case 13: /* 8192-16383 */ + if (a < 10000) + return LIMB_DIGITS - 4; + else + return LIMB_DIGITS - 5; + case 14: /* 16384-32767 */ + return LIMB_DIGITS - 5; + case 15: /* 32768-65535 */ + return LIMB_DIGITS - 5; + case 16: /* 65536-131071 */ + if (a < 100000) + return LIMB_DIGITS - 5; + else + return LIMB_DIGITS - 6; + case 17: /* 131072-262143 */ + return LIMB_DIGITS - 6; + case 18: /* 262144-524287 */ + return LIMB_DIGITS - 6; + case 19: /* 524288-1048575 */ + if (a < 1000000) + return LIMB_DIGITS - 6; + else + return LIMB_DIGITS - 7; + case 20: /* 1048576-2097151 */ + return LIMB_DIGITS - 7; + case 21: /* 2097152-4194303 */ + return LIMB_DIGITS - 7; + case 22: /* 4194304-8388607 */ + return LIMB_DIGITS - 7; + case 23: /* 8388608-16777215 */ + if (a < 10000000) + return LIMB_DIGITS - 7; + else + return LIMB_DIGITS - 8; + case 24: /* 16777216-33554431 */ + return LIMB_DIGITS - 8; + case 25: /* 33554432-67108863 */ + return LIMB_DIGITS - 8; + case 26: /* 67108864-134217727 */ + if (a < 100000000) + return LIMB_DIGITS - 8; + else + return LIMB_DIGITS - 9; +#if LIMB_BITS == 64 + case 27: /* 134217728-268435455 */ + return LIMB_DIGITS - 9; + case 28: /* 268435456-536870911 */ + return LIMB_DIGITS - 9; + case 29: /* 536870912-1073741823 */ + if (a < 1000000000) + return LIMB_DIGITS - 9; + else + return LIMB_DIGITS - 10; + case 30: /* 1073741824-2147483647 */ + return LIMB_DIGITS - 10; + case 31: /* 2147483648-4294967295 */ + return LIMB_DIGITS - 10; + case 32: /* 4294967296-8589934591 */ + return LIMB_DIGITS - 10; + case 33: /* 8589934592-17179869183 */ + if (a < 10000000000) + return LIMB_DIGITS - 10; + else + return LIMB_DIGITS - 11; + case 34: /* 17179869184-34359738367 */ + return LIMB_DIGITS - 11; + case 35: /* 34359738368-68719476735 */ + return LIMB_DIGITS - 11; + case 36: /* 68719476736-137438953471 */ + if (a < 100000000000) + return LIMB_DIGITS - 11; + else + return LIMB_DIGITS - 12; + case 37: /* 137438953472-274877906943 */ + return LIMB_DIGITS - 12; + case 38: /* 274877906944-549755813887 */ + return LIMB_DIGITS - 12; + case 39: /* 549755813888-1099511627775 */ + if (a < 1000000000000) + return LIMB_DIGITS - 12; + else + return LIMB_DIGITS - 13; + case 40: /* 1099511627776-2199023255551 */ + return LIMB_DIGITS - 13; + case 41: /* 2199023255552-4398046511103 */ + return LIMB_DIGITS - 13; + case 42: /* 4398046511104-8796093022207 */ + return LIMB_DIGITS - 13; + case 43: /* 8796093022208-17592186044415 */ + if (a < 10000000000000) + return LIMB_DIGITS - 13; + else + return LIMB_DIGITS - 14; + case 44: /* 17592186044416-35184372088831 */ + return LIMB_DIGITS - 14; + case 45: /* 35184372088832-70368744177663 */ + return LIMB_DIGITS - 14; + case 46: /* 70368744177664-140737488355327 */ + if (a < 100000000000000) + return LIMB_DIGITS - 14; + else + return LIMB_DIGITS - 15; + case 47: /* 140737488355328-281474976710655 */ + return LIMB_DIGITS - 15; + case 48: /* 281474976710656-562949953421311 */ + return LIMB_DIGITS - 15; + case 49: /* 562949953421312-1125899906842623 */ + if (a < 1000000000000000) + return LIMB_DIGITS - 15; + else + return LIMB_DIGITS - 16; + case 50: /* 1125899906842624-2251799813685247 */ + return LIMB_DIGITS - 16; + case 51: /* 2251799813685248-4503599627370495 */ + return LIMB_DIGITS - 16; + case 52: /* 4503599627370496-9007199254740991 */ + return LIMB_DIGITS - 16; + case 53: /* 9007199254740992-18014398509481983 */ + if (a < 10000000000000000) + return LIMB_DIGITS - 16; + else + return LIMB_DIGITS - 17; + case 54: /* 18014398509481984-36028797018963967 */ + return LIMB_DIGITS - 17; + case 55: /* 36028797018963968-72057594037927935 */ + return LIMB_DIGITS - 17; + case 56: /* 72057594037927936-144115188075855871 */ + if (a < 100000000000000000) + return LIMB_DIGITS - 17; + else + return LIMB_DIGITS - 18; + case 57: /* 144115188075855872-288230376151711743 */ + return LIMB_DIGITS - 18; + case 58: /* 288230376151711744-576460752303423487 */ + return LIMB_DIGITS - 18; + case 59: /* 576460752303423488-1152921504606846975 */ + if (a < 1000000000000000000) + return LIMB_DIGITS - 18; + else + return LIMB_DIGITS - 19; +#endif + default: + return 0; + } +} + +/* for debugging */ +void bfdec_print_str(const char *str, const bfdec_t *a) +{ + slimb_t i; + printf("%s=", str); + + if (a->expn == BF_EXP_NAN) { + printf("NaN"); + } else { + if (a->sign) + putchar('-'); + if (a->expn == BF_EXP_ZERO) { + putchar('0'); + } else if (a->expn == BF_EXP_INF) { + printf("Inf"); + } else { + printf("0."); + for(i = a->len - 1; i >= 0; i--) + printf("%0*" PRIu_LIMB, LIMB_DIGITS, a->tab[i]); + printf("e%" PRId_LIMB, a->expn); + } + } + printf("\n"); +} + +/* return != 0 if one digit between 0 and bit_pos inclusive is not zero. */ +static inline limb_t scan_digit_nz(const bfdec_t *r, slimb_t bit_pos) +{ + slimb_t pos; + limb_t v, q; + int shift; + + if (bit_pos < 0) + return 0; + pos = (limb_t)bit_pos / LIMB_DIGITS; + shift = (limb_t)bit_pos % LIMB_DIGITS; + fast_shr_rem_dec(q, v, r->tab[pos], shift + 1); + (void)q; + if (v != 0) + return 1; + pos--; + while (pos >= 0) { + if (r->tab[pos] != 0) + return 1; + pos--; + } + return 0; +} + +static limb_t get_digit(const limb_t *tab, limb_t len, slimb_t pos) +{ + slimb_t i; + int shift; + i = floor_div(pos, LIMB_DIGITS); + if (i < 0 || i >= len) + return 0; + shift = pos - i * LIMB_DIGITS; + return fast_shr_dec(tab[i], shift) % 10; +} + +#if 0 +static limb_t get_digits(const limb_t *tab, limb_t len, slimb_t pos) +{ + limb_t a0, a1; + int shift; + slimb_t i; + + i = floor_div(pos, LIMB_DIGITS); + shift = pos - i * LIMB_DIGITS; + if (i >= 0 && i < len) + a0 = tab[i]; + else + a0 = 0; + if (shift == 0) { + return a0; + } else { + i++; + if (i >= 0 && i < len) + a1 = tab[i]; + else + a1 = 0; + return fast_shr_dec(a0, shift) + + fast_urem(a1, &mp_pow_div[LIMB_DIGITS - shift]) * + mp_pow_dec[shift]; + } +} +#endif + +/* return the addend for rounding. Note that prec can be <= 0 for bf_rint() */ +static int bfdec_get_rnd_add(int *pret, const bfdec_t *r, limb_t l, + slimb_t prec, int rnd_mode) +{ + int add_one, inexact; + limb_t digit1, digit0; + + // bfdec_print_str("get_rnd_add", r); + if (rnd_mode == BF_RNDF) { + digit0 = 1; /* faithful rounding does not honor the INEXACT flag */ + } else { + /* starting limb for bit 'prec + 1' */ + digit0 = scan_digit_nz(r, l * LIMB_DIGITS - 1 - bf_max(0, prec + 1)); + } + + /* get the digit at 'prec' */ + digit1 = get_digit(r->tab, l, l * LIMB_DIGITS - 1 - prec); + inexact = (digit1 | digit0) != 0; + + add_one = 0; + switch(rnd_mode) { + case BF_RNDZ: + break; + case BF_RNDN: + if (digit1 == 5) { + if (digit0) { + add_one = 1; + } else { + /* round to even */ + add_one = + get_digit(r->tab, l, l * LIMB_DIGITS - 1 - (prec - 1)) & 1; + } + } else if (digit1 > 5) { + add_one = 1; + } + break; + case BF_RNDD: + case BF_RNDU: + if (r->sign == (rnd_mode == BF_RNDD)) + add_one = inexact; + break; + case BF_RNDNA: + case BF_RNDF: + add_one = (digit1 >= 5); + break; + case BF_RNDA: + add_one = inexact; + break; + default: + abort(); + } + + if (inexact) + *pret |= BF_ST_INEXACT; + return add_one; +} + +/* round to prec1 bits assuming 'r' is non zero and finite. 'r' is + assumed to have length 'l' (1 <= l <= r->len). prec1 can be + BF_PREC_INF. BF_FLAG_SUBNORMAL is not supported. Cannot fail with + BF_ST_MEM_ERROR. + */ +static int __bfdec_round(bfdec_t *r, limb_t prec1, bf_flags_t flags, limb_t l) +{ + int shift, add_one, rnd_mode, ret; + slimb_t i, bit_pos, pos, e_min, e_max, e_range, prec; + + /* XXX: align to IEEE 754 2008 for decimal numbers ? */ + e_range = (limb_t)1 << (bf_get_exp_bits(flags) - 1); + e_min = -e_range + 3; + e_max = e_range; + + if (flags & BF_FLAG_RADPNT_PREC) { + /* 'prec' is the precision after the decimal point */ + if (prec1 != BF_PREC_INF) + prec = r->expn + prec1; + else + prec = prec1; + } else if (unlikely(r->expn < e_min) && (flags & BF_FLAG_SUBNORMAL)) { + /* restrict the precision in case of potentially subnormal + result */ + assert(prec1 != BF_PREC_INF); + prec = prec1 - (e_min - r->expn); + } else { + prec = prec1; + } + + /* round to prec bits */ + rnd_mode = flags & BF_RND_MASK; + ret = 0; + add_one = bfdec_get_rnd_add(&ret, r, l, prec, rnd_mode); + + if (prec <= 0) { + if (add_one) { + bfdec_resize(r, 1); /* cannot fail because r is non zero */ + r->tab[0] = BF_DEC_BASE / 10; + r->expn += 1 - prec; + ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT; + return ret; + } else { + goto underflow; + } + } else if (add_one) { + limb_t carry; + + /* add one starting at digit 'prec - 1' */ + bit_pos = l * LIMB_DIGITS - 1 - (prec - 1); + pos = bit_pos / LIMB_DIGITS; + carry = mp_pow_dec[bit_pos % LIMB_DIGITS]; + carry = mp_add_ui_dec(r->tab + pos, carry, l - pos); + if (carry) { + /* shift right by one digit */ + mp_shr_dec(r->tab + pos, r->tab + pos, l - pos, 1, 1); + r->expn++; + } + } + + /* check underflow */ + if (unlikely(r->expn < e_min)) { + if (flags & BF_FLAG_SUBNORMAL) { + /* if inexact, also set the underflow flag */ + if (ret & BF_ST_INEXACT) + ret |= BF_ST_UNDERFLOW; + } else { + underflow: + bfdec_set_zero(r, r->sign); + ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT; + return ret; + } + } + + /* check overflow */ + if (unlikely(r->expn > e_max)) { + bfdec_set_inf(r, r->sign); + ret |= BF_ST_OVERFLOW | BF_ST_INEXACT; + return ret; + } + + /* keep the bits starting at 'prec - 1' */ + bit_pos = l * LIMB_DIGITS - 1 - (prec - 1); + i = floor_div(bit_pos, LIMB_DIGITS); + if (i >= 0) { + shift = smod(bit_pos, LIMB_DIGITS); + if (shift != 0) { + r->tab[i] = fast_shr_dec(r->tab[i], shift) * + mp_pow_dec[shift]; + } + } else { + i = 0; + } + /* remove trailing zeros */ + while (r->tab[i] == 0) + i++; + if (i > 0) { + l -= i; + memmove(r->tab, r->tab + i, l * sizeof(limb_t)); + } + bfdec_resize(r, l); /* cannot fail */ + return ret; +} + +/* Cannot fail with BF_ST_MEM_ERROR. */ +int bfdec_round(bfdec_t *r, limb_t prec, bf_flags_t flags) +{ + if (r->len == 0) + return 0; + return __bfdec_round(r, prec, flags, r->len); +} + +/* 'r' must be a finite number. Cannot fail with BF_ST_MEM_ERROR. */ +int bfdec_normalize_and_round(bfdec_t *r, limb_t prec1, bf_flags_t flags) +{ + limb_t l, v; + int shift, ret; + + // bfdec_print_str("bf_renorm", r); + l = r->len; + while (l > 0 && r->tab[l - 1] == 0) + l--; + if (l == 0) { + /* zero */ + r->expn = BF_EXP_ZERO; + bfdec_resize(r, 0); /* cannot fail */ + ret = 0; + } else { + r->expn -= (r->len - l) * LIMB_DIGITS; + /* shift to have the MSB set to '1' */ + v = r->tab[l - 1]; + shift = clz_dec(v); + if (shift != 0) { + mp_shl_dec(r->tab, r->tab, l, shift, 0); + r->expn -= shift; + } + ret = __bfdec_round(r, prec1, flags, l); + } + // bf_print_str("r_final", r); + return ret; +} + +int bfdec_set_ui(bfdec_t *r, uint64_t v) +{ +#if LIMB_BITS == 32 + if (v >= BF_DEC_BASE * BF_DEC_BASE) { + if (bfdec_resize(r, 3)) + goto fail; + r->tab[0] = v % BF_DEC_BASE; + v /= BF_DEC_BASE; + r->tab[1] = v % BF_DEC_BASE; + r->tab[2] = v / BF_DEC_BASE; + r->expn = 3 * LIMB_DIGITS; + } else +#endif + if (v >= BF_DEC_BASE) { + if (bfdec_resize(r, 2)) + goto fail; + r->tab[0] = v % BF_DEC_BASE; + r->tab[1] = v / BF_DEC_BASE; + r->expn = 2 * LIMB_DIGITS; + } else { + if (bfdec_resize(r, 1)) + goto fail; + r->tab[0] = v; + r->expn = LIMB_DIGITS; + } + r->sign = 0; + return bfdec_normalize_and_round(r, BF_PREC_INF, 0); + fail: + bfdec_set_nan(r); + return BF_ST_MEM_ERROR; +} + +int bfdec_set_si(bfdec_t *r, int64_t v) +{ + int ret; + if (v < 0) { + ret = bfdec_set_ui(r, -v); + r->sign = 1; + } else { + ret = bfdec_set_ui(r, v); + } + return ret; +} + +static int bfdec_add_internal(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, bf_flags_t flags, int b_neg) +{ + bf_context_t *s = r->ctx; + int is_sub, cmp_res, a_sign, b_sign, ret; + + a_sign = a->sign; + b_sign = b->sign ^ b_neg; + is_sub = a_sign ^ b_sign; + cmp_res = bfdec_cmpu(a, b); + if (cmp_res < 0) { + const bfdec_t *tmp; + tmp = a; + a = b; + b = tmp; + a_sign = b_sign; /* b_sign is never used later */ + } + /* abs(a) >= abs(b) */ + if (cmp_res == 0 && is_sub && a->expn < BF_EXP_INF) { + /* zero result */ + bfdec_set_zero(r, (flags & BF_RND_MASK) == BF_RNDD); + ret = 0; + } else if (a->len == 0 || b->len == 0) { + ret = 0; + if (a->expn >= BF_EXP_INF) { + if (a->expn == BF_EXP_NAN) { + /* at least one operand is NaN */ + bfdec_set_nan(r); + ret = 0; + } else if (b->expn == BF_EXP_INF && is_sub) { + /* infinities with different signs */ + bfdec_set_nan(r); + ret = BF_ST_INVALID_OP; + } else { + bfdec_set_inf(r, a_sign); + } + } else { + /* at least one zero and not subtract */ + if (bfdec_set(r, a)) + return BF_ST_MEM_ERROR; + r->sign = a_sign; + goto renorm; + } + } else { + slimb_t d, a_offset, b_offset, i, r_len; + limb_t carry; + limb_t *b1_tab; + int b_shift; + mp_size_t b1_len; + + d = a->expn - b->expn; + + /* XXX: not efficient in time and memory if the precision is + not infinite */ + r_len = bf_max(a->len, b->len + (d + LIMB_DIGITS - 1) / LIMB_DIGITS); + if (bfdec_resize(r, r_len)) + goto fail; + r->sign = a_sign; + r->expn = a->expn; + + a_offset = r_len - a->len; + for(i = 0; i < a_offset; i++) + r->tab[i] = 0; + for(i = 0; i < a->len; i++) + r->tab[a_offset + i] = a->tab[i]; + + b_shift = d % LIMB_DIGITS; + if (b_shift == 0) { + b1_len = b->len; + b1_tab = (limb_t *)b->tab; + } else { + b1_len = b->len + 1; + b1_tab = bf_malloc(s, sizeof(limb_t) * b1_len); + if (!b1_tab) + goto fail; + b1_tab[0] = mp_shr_dec(b1_tab + 1, b->tab, b->len, b_shift, 0) * + mp_pow_dec[LIMB_DIGITS - b_shift]; + } + b_offset = r_len - (b->len + (d + LIMB_DIGITS - 1) / LIMB_DIGITS); + + if (is_sub) { + carry = mp_sub_dec(r->tab + b_offset, r->tab + b_offset, + b1_tab, b1_len, 0); + if (carry != 0) { + carry = mp_sub_ui_dec(r->tab + b_offset + b1_len, carry, + r_len - (b_offset + b1_len)); + assert(carry == 0); + } + } else { + carry = mp_add_dec(r->tab + b_offset, r->tab + b_offset, + b1_tab, b1_len, 0); + if (carry != 0) { + carry = mp_add_ui_dec(r->tab + b_offset + b1_len, carry, + r_len - (b_offset + b1_len)); + } + if (carry != 0) { + if (bfdec_resize(r, r_len + 1)) { + if (b_shift != 0) + bf_free(s, b1_tab); + goto fail; + } + r->tab[r_len] = 1; + r->expn += LIMB_DIGITS; + } + } + if (b_shift != 0) + bf_free(s, b1_tab); + renorm: + ret = bfdec_normalize_and_round(r, prec, flags); + } + return ret; + fail: + bfdec_set_nan(r); + return BF_ST_MEM_ERROR; +} + +static int __bfdec_add(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, + bf_flags_t flags) +{ + return bfdec_add_internal(r, a, b, prec, flags, 0); +} + +static int __bfdec_sub(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, + bf_flags_t flags) +{ + return bfdec_add_internal(r, a, b, prec, flags, 1); +} + +int bfdec_add(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, + bf_flags_t flags) +{ + return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags, + (bf_op2_func_t *)__bfdec_add); +} + +int bfdec_sub(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, + bf_flags_t flags) +{ + return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags, + (bf_op2_func_t *)__bfdec_sub); +} + +int bfdec_mul(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, + bf_flags_t flags) +{ + int ret, r_sign; + + if (a->len < b->len) { + const bfdec_t *tmp = a; + a = b; + b = tmp; + } + r_sign = a->sign ^ b->sign; + /* here b->len <= a->len */ + if (b->len == 0) { + if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { + bfdec_set_nan(r); + ret = 0; + } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_INF) { + if ((a->expn == BF_EXP_INF && b->expn == BF_EXP_ZERO) || + (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_INF)) { + bfdec_set_nan(r); + ret = BF_ST_INVALID_OP; + } else { + bfdec_set_inf(r, r_sign); + ret = 0; + } + } else { + bfdec_set_zero(r, r_sign); + ret = 0; + } + } else { + bfdec_t tmp, *r1 = NULL; + limb_t a_len, b_len; + limb_t *a_tab, *b_tab; + + a_len = a->len; + b_len = b->len; + a_tab = a->tab; + b_tab = b->tab; + + if (r == a || r == b) { + bfdec_init(r->ctx, &tmp); + r1 = r; + r = &tmp; + } + if (bfdec_resize(r, a_len + b_len)) { + bfdec_set_nan(r); + ret = BF_ST_MEM_ERROR; + goto done; + } + mp_mul_basecase_dec(r->tab, a_tab, a_len, b_tab, b_len); + r->sign = r_sign; + r->expn = a->expn + b->expn; + ret = bfdec_normalize_and_round(r, prec, flags); + done: + if (r == &tmp) + bfdec_move(r1, &tmp); + } + return ret; +} + +int bfdec_mul_si(bfdec_t *r, const bfdec_t *a, int64_t b1, limb_t prec, + bf_flags_t flags) +{ + bfdec_t b; + int ret; + bfdec_init(r->ctx, &b); + ret = bfdec_set_si(&b, b1); + ret |= bfdec_mul(r, a, &b, prec, flags); + bfdec_delete(&b); + return ret; +} + +int bfdec_add_si(bfdec_t *r, const bfdec_t *a, int64_t b1, limb_t prec, + bf_flags_t flags) +{ + bfdec_t b; + int ret; + + bfdec_init(r->ctx, &b); + ret = bfdec_set_si(&b, b1); + ret |= bfdec_add(r, a, &b, prec, flags); + bfdec_delete(&b); + return ret; +} + +static int __bfdec_div(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, + limb_t prec, bf_flags_t flags) +{ + int ret, r_sign; + limb_t n, nb, precl; + + r_sign = a->sign ^ b->sign; + if (a->expn >= BF_EXP_INF || b->expn >= BF_EXP_INF) { + if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { + bfdec_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF && b->expn == BF_EXP_INF) { + bfdec_set_nan(r); + return BF_ST_INVALID_OP; + } else if (a->expn == BF_EXP_INF) { + bfdec_set_inf(r, r_sign); + return 0; + } else { + bfdec_set_zero(r, r_sign); + return 0; + } + } else if (a->expn == BF_EXP_ZERO) { + if (b->expn == BF_EXP_ZERO) { + bfdec_set_nan(r); + return BF_ST_INVALID_OP; + } else { + bfdec_set_zero(r, r_sign); + return 0; + } + } else if (b->expn == BF_EXP_ZERO) { + bfdec_set_inf(r, r_sign); + return BF_ST_DIVIDE_ZERO; + } + + nb = b->len; + if (prec == BF_PREC_INF) { + /* infinite precision: return BF_ST_INVALID_OP if not an exact + result */ + /* XXX: check */ + precl = nb + 1; + } else if (flags & BF_FLAG_RADPNT_PREC) { + /* number of digits after the decimal point */ + /* XXX: check (2 extra digits for rounding + 2 digits) */ + precl = (bf_max(a->expn - b->expn, 0) + 2 + + prec + 2 + LIMB_DIGITS - 1) / LIMB_DIGITS; + } else { + /* number of limbs of the quotient (2 extra digits for rounding) */ + precl = (prec + 2 + LIMB_DIGITS - 1) / LIMB_DIGITS; + } + n = bf_max(a->len, precl); + + { + limb_t *taba, na, i; + slimb_t d; + + na = n + nb; + taba = bf_malloc(r->ctx, (na + 1) * sizeof(limb_t)); + if (!taba) + goto fail; + d = na - a->len; + memset(taba, 0, d * sizeof(limb_t)); + memcpy(taba + d, a->tab, a->len * sizeof(limb_t)); + if (bfdec_resize(r, n + 1)) + goto fail1; + if (mp_div_dec(r->ctx, r->tab, taba, na, b->tab, nb)) { + fail1: + bf_free(r->ctx, taba); + goto fail; + } + /* see if non zero remainder */ + for(i = 0; i < nb; i++) { + if (taba[i] != 0) + break; + } + bf_free(r->ctx, taba); + if (i != nb) { + if (prec == BF_PREC_INF) { + bfdec_set_nan(r); + return BF_ST_INVALID_OP; + } else { + r->tab[0] |= 1; + } + } + r->expn = a->expn - b->expn + LIMB_DIGITS; + r->sign = r_sign; + ret = bfdec_normalize_and_round(r, prec, flags); + } + return ret; + fail: + bfdec_set_nan(r); + return BF_ST_MEM_ERROR; +} + +int bfdec_div(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, + bf_flags_t flags) +{ + return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags, + (bf_op2_func_t *)__bfdec_div); +} + +/* a and b must be finite numbers with a >= 0 and b > 0. 'q' is the + integer defined as floor(a/b) and r = a - q * b. */ +static void bfdec_tdivremu(bf_context_t *s, bfdec_t *q, bfdec_t *r, + const bfdec_t *a, const bfdec_t *b) +{ + if (bfdec_cmpu(a, b) < 0) { + bfdec_set_ui(q, 0); + bfdec_set(r, a); + } else { + bfdec_div(q, a, b, 0, BF_RNDZ | BF_FLAG_RADPNT_PREC); + bfdec_mul(r, q, b, BF_PREC_INF, BF_RNDZ); + bfdec_sub(r, a, r, BF_PREC_INF, BF_RNDZ); + } +} + +/* division and remainder. + + rnd_mode is the rounding mode for the quotient. The additional + rounding mode BF_RND_EUCLIDIAN is supported. + + 'q' is an integer. 'r' is rounded with prec and flags (prec can be + BF_PREC_INF). +*/ +int bfdec_divrem(bfdec_t *q, bfdec_t *r, const bfdec_t *a, const bfdec_t *b, + limb_t prec, bf_flags_t flags, int rnd_mode) +{ + bf_context_t *s = q->ctx; + bfdec_t a1_s, *a1 = &a1_s; + bfdec_t b1_s, *b1 = &b1_s; + bfdec_t r1_s, *r1 = &r1_s; + int q_sign, res; + BOOL is_ceil, is_rndn; + + assert(q != a && q != b); + assert(r != a && r != b); + assert(q != r); + + if (a->len == 0 || b->len == 0) { + bfdec_set_zero(q, 0); + if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { + bfdec_set_nan(r); + return 0; + } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_ZERO) { + bfdec_set_nan(r); + return BF_ST_INVALID_OP; + } else { + bfdec_set(r, a); + return bfdec_round(r, prec, flags); + } + } + + q_sign = a->sign ^ b->sign; + is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA); + switch(rnd_mode) { + default: + case BF_RNDZ: + case BF_RNDN: + case BF_RNDNA: + is_ceil = FALSE; + break; + case BF_RNDD: + is_ceil = q_sign; + break; + case BF_RNDU: + is_ceil = q_sign ^ 1; + break; + case BF_RNDA: + is_ceil = TRUE; + break; + case BF_DIVREM_EUCLIDIAN: + is_ceil = a->sign; + break; + } + + a1->expn = a->expn; + a1->tab = a->tab; + a1->len = a->len; + a1->sign = 0; + + b1->expn = b->expn; + b1->tab = b->tab; + b1->len = b->len; + b1->sign = 0; + + // bfdec_print_str("a1", a1); + // bfdec_print_str("b1", b1); + /* XXX: could improve to avoid having a large 'q' */ + bfdec_tdivremu(s, q, r, a1, b1); + if (bfdec_is_nan(q) || bfdec_is_nan(r)) + goto fail; + // bfdec_print_str("q", q); + // bfdec_print_str("r", r); + + if (r->len != 0) { + if (is_rndn) { + bfdec_init(s, r1); + if (bfdec_set(r1, r)) + goto fail; + if (bfdec_mul_si(r1, r1, 2, BF_PREC_INF, BF_RNDZ)) { + bfdec_delete(r1); + goto fail; + } + res = bfdec_cmpu(r1, b); + bfdec_delete(r1); + if (res > 0 || + (res == 0 && + (rnd_mode == BF_RNDNA || + (get_digit(q->tab, q->len, q->len * LIMB_DIGITS - q->expn) & 1) != 0))) { + goto do_sub_r; + } + } else if (is_ceil) { + do_sub_r: + res = bfdec_add_si(q, q, 1, BF_PREC_INF, BF_RNDZ); + res |= bfdec_sub(r, r, b1, BF_PREC_INF, BF_RNDZ); + if (res & BF_ST_MEM_ERROR) + goto fail; + } + } + + r->sign ^= a->sign; + q->sign = q_sign; + return bfdec_round(r, prec, flags); + fail: + bfdec_set_nan(q); + bfdec_set_nan(r); + return BF_ST_MEM_ERROR; +} + +int bfdec_rem(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, + bf_flags_t flags, int rnd_mode) +{ + bfdec_t q_s, *q = &q_s; + int ret; + + bfdec_init(r->ctx, q); + ret = bfdec_divrem(q, r, a, b, prec, flags, rnd_mode); + bfdec_delete(q); + return ret; +} + +/* convert to integer (infinite precision) */ +int bfdec_rint(bfdec_t *r, int rnd_mode) +{ + return bfdec_round(r, 0, rnd_mode | BF_FLAG_RADPNT_PREC); +} + +int bfdec_sqrt(bfdec_t *r, const bfdec_t *a, limb_t prec, bf_flags_t flags) +{ + bf_context_t *s = a->ctx; + int ret, k; + limb_t *a1, v; + slimb_t n, n1, prec1; + limb_t res; + + assert(r != a); + + if (a->len == 0) { + if (a->expn == BF_EXP_NAN) { + bfdec_set_nan(r); + } else if (a->expn == BF_EXP_INF && a->sign) { + goto invalid_op; + } else { + bfdec_set(r, a); + } + ret = 0; + } else if (a->sign || prec == BF_PREC_INF) { + invalid_op: + bfdec_set_nan(r); + ret = BF_ST_INVALID_OP; + } else { + if (flags & BF_FLAG_RADPNT_PREC) { + prec1 = bf_max(floor_div(a->expn + 1, 2) + prec, 1); + } else { + prec1 = prec; + } + /* convert the mantissa to an integer with at least 2 * + prec + 4 digits */ + n = (2 * (prec1 + 2) + 2 * LIMB_DIGITS - 1) / (2 * LIMB_DIGITS); + if (bfdec_resize(r, n)) + goto fail; + a1 = bf_malloc(s, sizeof(limb_t) * 2 * n); + if (!a1) + goto fail; + n1 = bf_min(2 * n, a->len); + memset(a1, 0, (2 * n - n1) * sizeof(limb_t)); + memcpy(a1 + 2 * n - n1, a->tab + a->len - n1, n1 * sizeof(limb_t)); + if (a->expn & 1) { + res = mp_shr_dec(a1, a1, 2 * n, 1, 0); + } else { + res = 0; + } + /* normalize so that a1 >= B^(2*n)/4. Not need for n = 1 + because mp_sqrtrem2_dec already does it */ + k = 0; + if (n > 1) { + v = a1[2 * n - 1]; + while (v < BF_DEC_BASE / 4) { + k++; + v *= 4; + } + if (k != 0) + mp_mul1_dec(a1, a1, 2 * n, 1 << (2 * k), 0); + } + if (mp_sqrtrem_dec(s, r->tab, a1, n)) { + bf_free(s, a1); + goto fail; + } + if (k != 0) + mp_div1_dec(r->tab, r->tab, n, 1 << k, 0); + if (!res) { + res = mp_scan_nz(a1, n + 1); + } + bf_free(s, a1); + if (!res) { + res = mp_scan_nz(a->tab, a->len - n1); + } + if (res != 0) + r->tab[0] |= 1; + r->sign = 0; + r->expn = (a->expn + 1) >> 1; + ret = bfdec_round(r, prec, flags); + } + return ret; + fail: + bfdec_set_nan(r); + return BF_ST_MEM_ERROR; +} + +/* The rounding mode is always BF_RNDZ. Return BF_ST_OVERFLOW if there + is an overflow and 0 otherwise. No memory error is possible. */ +int bfdec_get_int32(int *pres, const bfdec_t *a) +{ + uint32_t v; + int ret; + if (a->expn >= BF_EXP_INF) { + ret = 0; + if (a->expn == BF_EXP_INF) { + v = (uint32_t)INT32_MAX + a->sign; + /* XXX: return overflow ? */ + } else { + v = INT32_MAX; + } + } else if (a->expn <= 0) { + v = 0; + ret = 0; + } else if (a->expn <= 9) { + v = fast_shr_dec(a->tab[a->len - 1], LIMB_DIGITS - a->expn); + if (a->sign) + v = -v; + ret = 0; + } else if (a->expn == 10) { + uint64_t v1; + uint32_t v_max; +#if LIMB_BITS == 64 + v1 = fast_shr_dec(a->tab[a->len - 1], LIMB_DIGITS - a->expn); +#else + v1 = (uint64_t)a->tab[a->len - 1] * 10 + + get_digit(a->tab, a->len, (a->len - 1) * LIMB_DIGITS - 1); +#endif + v_max = (uint32_t)INT32_MAX + a->sign; + if (v1 > v_max) { + v = v_max; + ret = BF_ST_OVERFLOW; + } else { + v = v1; + if (a->sign) + v = -v; + ret = 0; + } + } else { + v = (uint32_t)INT32_MAX + a->sign; + ret = BF_ST_OVERFLOW; + } + *pres = v; + return ret; +} + +/* power to an integer with infinite precision */ +int bfdec_pow_ui(bfdec_t *r, const bfdec_t *a, limb_t b) +{ + int ret, n_bits, i; + + assert(r != a); + if (b == 0) + return bfdec_set_ui(r, 1); + ret = bfdec_set(r, a); + n_bits = LIMB_BITS - clz(b); + for(i = n_bits - 2; i >= 0; i--) { + ret |= bfdec_mul(r, r, r, BF_PREC_INF, BF_RNDZ); + if ((b >> i) & 1) + ret |= bfdec_mul(r, r, a, BF_PREC_INF, BF_RNDZ); + } + return ret; +} + +char *bfdec_ftoa(size_t *plen, const bfdec_t *a, limb_t prec, bf_flags_t flags) +{ + return bf_ftoa_internal(plen, (const bf_t *)a, 10, prec, flags, TRUE); +} + +int bfdec_atof(bfdec_t *r, const char *str, const char **pnext, + limb_t prec, bf_flags_t flags) +{ + slimb_t dummy_exp; + return bf_atof_internal((bf_t *)r, &dummy_exp, str, pnext, 10, prec, + flags, TRUE); +} + +#endif /* USE_BF_DEC */ + +#ifdef USE_FFT_MUL +/***************************************************************/ +/* Integer multiplication with FFT */ + +/* or LIMB_BITS at bit position 'pos' in tab */ +static inline void put_bits(limb_t *tab, limb_t len, slimb_t pos, limb_t val) +{ + limb_t i; + int p; + + i = pos >> LIMB_LOG2_BITS; + p = pos & (LIMB_BITS - 1); + if (i < len) + tab[i] |= val << p; + if (p != 0) { + i++; + if (i < len) { + tab[i] |= val >> (LIMB_BITS - p); + } + } +} + +#if defined(__AVX2__) + +typedef double NTTLimb; + +/* we must have: modulo >= 1 << NTT_MOD_LOG2_MIN */ +#define NTT_MOD_LOG2_MIN 50 +#define NTT_MOD_LOG2_MAX 51 +#define NB_MODS 5 +#define NTT_PROOT_2EXP 39 +static const int ntt_int_bits[NB_MODS] = { 254, 203, 152, 101, 50, }; + +static const limb_t ntt_mods[NB_MODS] = { 0x00073a8000000001, 0x0007858000000001, 0x0007a38000000001, 0x0007a68000000001, 0x0007fd8000000001, +}; + +static const limb_t ntt_proot[2][NB_MODS] = { + { 0x00056198d44332c8, 0x0002eb5d640aad39, 0x00047e31eaa35fd0, 0x0005271ac118a150, 0x00075e0ce8442bd5, }, + { 0x000461169761bcc5, 0x0002dac3cb2da688, 0x0004abc97751e3bf, 0x000656778fc8c485, 0x0000dc6469c269fa, }, +}; + +static const limb_t ntt_mods_cr[NB_MODS * (NB_MODS - 1) / 2] = { + 0x00020e4da740da8e, 0x0004c3dc09c09c1d, 0x000063bd097b4271, 0x000799d8f18f18fd, + 0x0005384222222264, 0x000572b07c1f07fe, 0x00035cd08888889a, + 0x00066015555557e3, 0x000725960b60b623, + 0x0002fc1fa1d6ce12, +}; + +#else + +typedef limb_t NTTLimb; + +#if LIMB_BITS == 64 + +#define NTT_MOD_LOG2_MIN 61 +#define NTT_MOD_LOG2_MAX 62 +#define NB_MODS 5 +#define NTT_PROOT_2EXP 51 +static const int ntt_int_bits[NB_MODS] = { 307, 246, 185, 123, 61, }; + +static const limb_t ntt_mods[NB_MODS] = { 0x28d8000000000001, 0x2a88000000000001, 0x2ed8000000000001, 0x3508000000000001, 0x3aa8000000000001, +}; + +static const limb_t ntt_proot[2][NB_MODS] = { + { 0x1b8ea61034a2bea7, 0x21a9762de58206fb, 0x02ca782f0756a8ea, 0x278384537a3e50a1, 0x106e13fee74ce0ab, }, + { 0x233513af133e13b8, 0x1d13140d1c6f75f1, 0x12cde57f97e3eeda, 0x0d6149e23cbe654f, 0x36cd204f522a1379, }, +}; + +static const limb_t ntt_mods_cr[NB_MODS * (NB_MODS - 1) / 2] = { + 0x08a9ed097b425eea, 0x18a44aaaaaaaaab3, 0x2493f57f57f57f5d, 0x126b8d0649a7f8d4, + 0x09d80ed7303b5ccc, 0x25b8bcf3cf3cf3d5, 0x2ce6ce63398ce638, + 0x0e31fad40a57eb59, 0x02a3529fd4a7f52f, + 0x3a5493e93e93e94a, +}; + +#elif LIMB_BITS == 32 + +/* we must have: modulo >= 1 << NTT_MOD_LOG2_MIN */ +#define NTT_MOD_LOG2_MIN 29 +#define NTT_MOD_LOG2_MAX 30 +#define NB_MODS 5 +#define NTT_PROOT_2EXP 20 +static const int ntt_int_bits[NB_MODS] = { 148, 119, 89, 59, 29, }; + +static const limb_t ntt_mods[NB_MODS] = { 0x0000000032b00001, 0x0000000033700001, 0x0000000036d00001, 0x0000000037300001, 0x000000003e500001, +}; + +static const limb_t ntt_proot[2][NB_MODS] = { + { 0x0000000032525f31, 0x0000000005eb3b37, 0x00000000246eda9f, 0x0000000035f25901, 0x00000000022f5768, }, + { 0x00000000051eba1a, 0x00000000107be10e, 0x000000001cd574e0, 0x00000000053806e6, 0x000000002cd6bf98, }, +}; + +static const limb_t ntt_mods_cr[NB_MODS * (NB_MODS - 1) / 2] = { + 0x000000000449559a, 0x000000001eba6ca9, 0x000000002ec18e46, 0x000000000860160b, + 0x000000000d321307, 0x000000000bf51120, 0x000000000f662938, + 0x000000000932ab3e, 0x000000002f40eef8, + 0x000000002e760905, +}; + +#endif /* LIMB_BITS */ + +#endif /* !AVX2 */ + +#if defined(__AVX2__) +#define NTT_TRIG_K_MAX 18 +#else +#define NTT_TRIG_K_MAX 19 +#endif + +typedef struct BFNTTState { + bf_context_t *ctx; + + /* used for mul_mod_fast() */ + limb_t ntt_mods_div[NB_MODS]; + + limb_t ntt_proot_pow[NB_MODS][2][NTT_PROOT_2EXP + 1]; + limb_t ntt_proot_pow_inv[NB_MODS][2][NTT_PROOT_2EXP + 1]; + NTTLimb *ntt_trig[NB_MODS][2][NTT_TRIG_K_MAX + 1]; + /* 1/2^n mod m */ + limb_t ntt_len_inv[NB_MODS][NTT_PROOT_2EXP + 1][2]; +#if defined(__AVX2__) + __m256d ntt_mods_cr_vec[NB_MODS * (NB_MODS - 1) / 2]; + __m256d ntt_mods_vec[NB_MODS]; + __m256d ntt_mods_inv_vec[NB_MODS]; +#else + limb_t ntt_mods_cr_inv[NB_MODS * (NB_MODS - 1) / 2]; +#endif +} BFNTTState; + +static NTTLimb *get_trig(BFNTTState *s, int k, int inverse, int m_idx); + +/* add modulo with up to (LIMB_BITS-1) bit modulo */ +static inline limb_t add_mod(limb_t a, limb_t b, limb_t m) +{ + limb_t r; + r = a + b; + if (r >= m) + r -= m; + return r; +} + +/* sub modulo with up to LIMB_BITS bit modulo */ +static inline limb_t sub_mod(limb_t a, limb_t b, limb_t m) +{ + limb_t r; + r = a - b; + if (r > a) + r += m; + return r; +} + +/* return (r0+r1*B) mod m + precondition: 0 <= r0+r1*B < 2^(64+NTT_MOD_LOG2_MIN) +*/ +static inline limb_t mod_fast(dlimb_t r, + limb_t m, limb_t m_inv) +{ + limb_t a1, q, t0, r1, r0; + + a1 = r >> NTT_MOD_LOG2_MIN; + + q = ((dlimb_t)a1 * m_inv) >> LIMB_BITS; + r = r - (dlimb_t)q * m - m * 2; + r1 = r >> LIMB_BITS; + t0 = (slimb_t)r1 >> 1; + r += m & t0; + r0 = r; + r1 = r >> LIMB_BITS; + r0 += m & r1; + return r0; +} + +/* faster version using precomputed modulo inverse. + precondition: 0 <= a * b < 2^(64+NTT_MOD_LOG2_MIN) */ +static inline limb_t mul_mod_fast(limb_t a, limb_t b, + limb_t m, limb_t m_inv) +{ + dlimb_t r; + r = (dlimb_t)a * (dlimb_t)b; + return mod_fast(r, m, m_inv); +} + +static inline limb_t init_mul_mod_fast(limb_t m) +{ + dlimb_t t; + assert(m < (limb_t)1 << NTT_MOD_LOG2_MAX); + assert(m >= (limb_t)1 << NTT_MOD_LOG2_MIN); + t = (dlimb_t)1 << (LIMB_BITS + NTT_MOD_LOG2_MIN); + return t / m; +} + +/* Faster version used when the multiplier is constant. 0 <= a < 2^64, + 0 <= b < m. */ +static inline limb_t mul_mod_fast2(limb_t a, limb_t b, + limb_t m, limb_t b_inv) +{ + limb_t r, q; + + q = ((dlimb_t)a * (dlimb_t)b_inv) >> LIMB_BITS; + r = a * b - q * m; + if (r >= m) + r -= m; + return r; +} + +/* Faster version used when the multiplier is constant. 0 <= a < 2^64, + 0 <= b < m. Let r = a * b mod m. The return value is 'r' or 'r + + m'. */ +static inline limb_t mul_mod_fast3(limb_t a, limb_t b, + limb_t m, limb_t b_inv) +{ + limb_t r, q; + + q = ((dlimb_t)a * (dlimb_t)b_inv) >> LIMB_BITS; + r = a * b - q * m; + return r; +} + +static inline limb_t init_mul_mod_fast2(limb_t b, limb_t m) +{ + return ((dlimb_t)b << LIMB_BITS) / m; +} + +#ifdef __AVX2__ + +static inline limb_t ntt_limb_to_int(NTTLimb a, limb_t m) +{ + slimb_t v; + v = a; + if (v < 0) + v += m; + if (v >= m) + v -= m; + return v; +} + +static inline NTTLimb int_to_ntt_limb(limb_t a, limb_t m) +{ + return (slimb_t)a; +} + +static inline NTTLimb int_to_ntt_limb2(limb_t a, limb_t m) +{ + if (a >= (m / 2)) + a -= m; + return (slimb_t)a; +} + +/* return r + m if r < 0 otherwise r. */ +static inline __m256d ntt_mod1(__m256d r, __m256d m) +{ + return _mm256_blendv_pd(r, r + m, r); +} + +/* input: abs(r) < 2 * m. Output: abs(r) < m */ +static inline __m256d ntt_mod(__m256d r, __m256d mf, __m256d m2f) +{ + return _mm256_blendv_pd(r, r + m2f, r) - mf; +} + +/* input: abs(a*b) < 2 * m^2, output: abs(r) < m */ +static inline __m256d ntt_mul_mod(__m256d a, __m256d b, __m256d mf, + __m256d m_inv) +{ + __m256d r, q, ab1, ab0, qm0, qm1; + ab1 = a * b; + q = _mm256_round_pd(ab1 * m_inv, 0); /* round to nearest */ + qm1 = q * mf; + qm0 = _mm256_fmsub_pd(q, mf, qm1); /* low part */ + ab0 = _mm256_fmsub_pd(a, b, ab1); /* low part */ + r = (ab1 - qm1) + (ab0 - qm0); + return r; +} + +static void *bf_aligned_malloc(bf_context_t *s, size_t size, size_t align) +{ + void *ptr; + void **ptr1; + ptr = bf_malloc(s, size + sizeof(void *) + align - 1); + if (!ptr) + return NULL; + ptr1 = (void **)(((uintptr_t)ptr + sizeof(void *) + align - 1) & + ~(align - 1)); + ptr1[-1] = ptr; + return ptr1; +} + +static void bf_aligned_free(bf_context_t *s, void *ptr) +{ + if (!ptr) + return; + bf_free(s, ((void **)ptr)[-1]); +} + +static void *ntt_malloc(BFNTTState *s, size_t size) +{ + return bf_aligned_malloc(s->ctx, size, 64); +} + +static void ntt_free(BFNTTState *s, void *ptr) +{ + bf_aligned_free(s->ctx, ptr); +} + +static no_inline int ntt_fft(BFNTTState *s, + NTTLimb *out_buf, NTTLimb *in_buf, + NTTLimb *tmp_buf, int fft_len_log2, + int inverse, int m_idx) +{ + limb_t nb_blocks, fft_per_block, p, k, n, stride_in, i, j; + NTTLimb *tab_in, *tab_out, *tmp, *trig; + __m256d m_inv, mf, m2f, c, a0, a1, b0, b1; + limb_t m; + int l; + + m = ntt_mods[m_idx]; + + m_inv = _mm256_set1_pd(1.0 / (double)m); + mf = _mm256_set1_pd(m); + m2f = _mm256_set1_pd(m * 2); + + n = (limb_t)1 << fft_len_log2; + assert(n >= 8); + stride_in = n / 2; + + tab_in = in_buf; + tab_out = tmp_buf; + trig = get_trig(s, fft_len_log2, inverse, m_idx); + if (!trig) + return -1; + p = 0; + for(k = 0; k < stride_in; k += 4) { + a0 = _mm256_load_pd(&tab_in[k]); + a1 = _mm256_load_pd(&tab_in[k + stride_in]); + c = _mm256_load_pd(trig); + trig += 4; + b0 = ntt_mod(a0 + a1, mf, m2f); + b1 = ntt_mul_mod(a0 - a1, c, mf, m_inv); + a0 = _mm256_permute2f128_pd(b0, b1, 0x20); + a1 = _mm256_permute2f128_pd(b0, b1, 0x31); + a0 = _mm256_permute4x64_pd(a0, 0xd8); + a1 = _mm256_permute4x64_pd(a1, 0xd8); + _mm256_store_pd(&tab_out[p], a0); + _mm256_store_pd(&tab_out[p + 4], a1); + p += 2 * 4; + } + tmp = tab_in; + tab_in = tab_out; + tab_out = tmp; + + trig = get_trig(s, fft_len_log2 - 1, inverse, m_idx); + if (!trig) + return -1; + p = 0; + for(k = 0; k < stride_in; k += 4) { + a0 = _mm256_load_pd(&tab_in[k]); + a1 = _mm256_load_pd(&tab_in[k + stride_in]); + c = _mm256_setr_pd(trig[0], trig[0], trig[1], trig[1]); + trig += 2; + b0 = ntt_mod(a0 + a1, mf, m2f); + b1 = ntt_mul_mod(a0 - a1, c, mf, m_inv); + a0 = _mm256_permute2f128_pd(b0, b1, 0x20); + a1 = _mm256_permute2f128_pd(b0, b1, 0x31); + _mm256_store_pd(&tab_out[p], a0); + _mm256_store_pd(&tab_out[p + 4], a1); + p += 2 * 4; + } + tmp = tab_in; + tab_in = tab_out; + tab_out = tmp; + + nb_blocks = n / 4; + fft_per_block = 4; + + l = fft_len_log2 - 2; + while (nb_blocks != 2) { + nb_blocks >>= 1; + p = 0; + k = 0; + trig = get_trig(s, l, inverse, m_idx); + if (!trig) + return -1; + for(i = 0; i < nb_blocks; i++) { + c = _mm256_set1_pd(trig[0]); + trig++; + for(j = 0; j < fft_per_block; j += 4) { + a0 = _mm256_load_pd(&tab_in[k + j]); + a1 = _mm256_load_pd(&tab_in[k + j + stride_in]); + b0 = ntt_mod(a0 + a1, mf, m2f); + b1 = ntt_mul_mod(a0 - a1, c, mf, m_inv); + _mm256_store_pd(&tab_out[p + j], b0); + _mm256_store_pd(&tab_out[p + j + fft_per_block], b1); + } + k += fft_per_block; + p += 2 * fft_per_block; + } + fft_per_block <<= 1; + l--; + tmp = tab_in; + tab_in = tab_out; + tab_out = tmp; + } + + tab_out = out_buf; + for(k = 0; k < stride_in; k += 4) { + a0 = _mm256_load_pd(&tab_in[k]); + a1 = _mm256_load_pd(&tab_in[k + stride_in]); + b0 = ntt_mod(a0 + a1, mf, m2f); + b1 = ntt_mod(a0 - a1, mf, m2f); + _mm256_store_pd(&tab_out[k], b0); + _mm256_store_pd(&tab_out[k + stride_in], b1); + } + return 0; +} + +static void ntt_vec_mul(BFNTTState *s, + NTTLimb *tab1, NTTLimb *tab2, limb_t fft_len_log2, + int k_tot, int m_idx) +{ + limb_t i, c_inv, n, m; + __m256d m_inv, mf, a, b, c; + + m = ntt_mods[m_idx]; + c_inv = s->ntt_len_inv[m_idx][k_tot][0]; + m_inv = _mm256_set1_pd(1.0 / (double)m); + mf = _mm256_set1_pd(m); + c = _mm256_set1_pd(int_to_ntt_limb(c_inv, m)); + n = (limb_t)1 << fft_len_log2; + for(i = 0; i < n; i += 4) { + a = _mm256_load_pd(&tab1[i]); + b = _mm256_load_pd(&tab2[i]); + a = ntt_mul_mod(a, b, mf, m_inv); + a = ntt_mul_mod(a, c, mf, m_inv); + _mm256_store_pd(&tab1[i], a); + } +} + +static no_inline void mul_trig(NTTLimb *buf, + limb_t n, limb_t c1, limb_t m, limb_t m_inv1) +{ + limb_t i, c2, c3, c4; + __m256d c, c_mul, a0, mf, m_inv; + assert(n >= 2); + + mf = _mm256_set1_pd(m); + m_inv = _mm256_set1_pd(1.0 / (double)m); + + c2 = mul_mod_fast(c1, c1, m, m_inv1); + c3 = mul_mod_fast(c2, c1, m, m_inv1); + c4 = mul_mod_fast(c2, c2, m, m_inv1); + c = _mm256_setr_pd(1, int_to_ntt_limb(c1, m), + int_to_ntt_limb(c2, m), int_to_ntt_limb(c3, m)); + c_mul = _mm256_set1_pd(int_to_ntt_limb(c4, m)); + for(i = 0; i < n; i += 4) { + a0 = _mm256_load_pd(&buf[i]); + a0 = ntt_mul_mod(a0, c, mf, m_inv); + _mm256_store_pd(&buf[i], a0); + c = ntt_mul_mod(c, c_mul, mf, m_inv); + } +} + +#else + +static void *ntt_malloc(BFNTTState *s, size_t size) +{ + return bf_malloc(s->ctx, size); +} + +static void ntt_free(BFNTTState *s, void *ptr) +{ + bf_free(s->ctx, ptr); +} + +static inline limb_t ntt_limb_to_int(NTTLimb a, limb_t m) +{ + if (a >= m) + a -= m; + return a; +} + +static inline NTTLimb int_to_ntt_limb(slimb_t a, limb_t m) +{ + return a; +} + +static no_inline int ntt_fft(BFNTTState *s, NTTLimb *out_buf, NTTLimb *in_buf, + NTTLimb *tmp_buf, int fft_len_log2, + int inverse, int m_idx) +{ + limb_t nb_blocks, fft_per_block, p, k, n, stride_in, i, j, m, m2; + NTTLimb *tab_in, *tab_out, *tmp, a0, a1, b0, b1, c, *trig, c_inv; + int l; + + m = ntt_mods[m_idx]; + m2 = 2 * m; + n = (limb_t)1 << fft_len_log2; + nb_blocks = n; + fft_per_block = 1; + stride_in = n / 2; + tab_in = in_buf; + tab_out = tmp_buf; + l = fft_len_log2; + while (nb_blocks != 2) { + nb_blocks >>= 1; + p = 0; + k = 0; + trig = get_trig(s, l, inverse, m_idx); + if (!trig) + return -1; + for(i = 0; i < nb_blocks; i++) { + c = trig[0]; + c_inv = trig[1]; + trig += 2; + for(j = 0; j < fft_per_block; j++) { + a0 = tab_in[k + j]; + a1 = tab_in[k + j + stride_in]; + b0 = add_mod(a0, a1, m2); + b1 = a0 - a1 + m2; + b1 = mul_mod_fast3(b1, c, m, c_inv); + tab_out[p + j] = b0; + tab_out[p + j + fft_per_block] = b1; + } + k += fft_per_block; + p += 2 * fft_per_block; + } + fft_per_block <<= 1; + l--; + tmp = tab_in; + tab_in = tab_out; + tab_out = tmp; + } + /* no twiddle in last step */ + tab_out = out_buf; + for(k = 0; k < stride_in; k++) { + a0 = tab_in[k]; + a1 = tab_in[k + stride_in]; + b0 = add_mod(a0, a1, m2); + b1 = sub_mod(a0, a1, m2); + tab_out[k] = b0; + tab_out[k + stride_in] = b1; + } + return 0; +} + +static void ntt_vec_mul(BFNTTState *s, + NTTLimb *tab1, NTTLimb *tab2, int fft_len_log2, + int k_tot, int m_idx) +{ + limb_t i, norm, norm_inv, a, n, m, m_inv; + + m = ntt_mods[m_idx]; + m_inv = s->ntt_mods_div[m_idx]; + norm = s->ntt_len_inv[m_idx][k_tot][0]; + norm_inv = s->ntt_len_inv[m_idx][k_tot][1]; + n = (limb_t)1 << fft_len_log2; + for(i = 0; i < n; i++) { + a = tab1[i]; + /* need to reduce the range so that the product is < + 2^(LIMB_BITS+NTT_MOD_LOG2_MIN) */ + if (a >= m) + a -= m; + a = mul_mod_fast(a, tab2[i], m, m_inv); + a = mul_mod_fast3(a, norm, m, norm_inv); + tab1[i] = a; + } +} + +static no_inline void mul_trig(NTTLimb *buf, + limb_t n, limb_t c_mul, limb_t m, limb_t m_inv) +{ + limb_t i, c0, c_mul_inv; + + c0 = 1; + c_mul_inv = init_mul_mod_fast2(c_mul, m); + for(i = 0; i < n; i++) { + buf[i] = mul_mod_fast(buf[i], c0, m, m_inv); + c0 = mul_mod_fast2(c0, c_mul, m, c_mul_inv); + } +} + +#endif /* !AVX2 */ + +static no_inline NTTLimb *get_trig(BFNTTState *s, + int k, int inverse, int m_idx) +{ + NTTLimb *tab; + limb_t i, n2, c, c_mul, m, c_mul_inv; + + if (k > NTT_TRIG_K_MAX) + return NULL; + + tab = s->ntt_trig[m_idx][inverse][k]; + if (tab) + return tab; + n2 = (limb_t)1 << (k - 1); + m = ntt_mods[m_idx]; +#ifdef __AVX2__ + tab = ntt_malloc(s, sizeof(NTTLimb) * n2); +#else + tab = ntt_malloc(s, sizeof(NTTLimb) * n2 * 2); +#endif + if (!tab) + return NULL; + c = 1; + c_mul = s->ntt_proot_pow[m_idx][inverse][k]; + c_mul_inv = s->ntt_proot_pow_inv[m_idx][inverse][k]; + for(i = 0; i < n2; i++) { +#ifdef __AVX2__ + tab[i] = int_to_ntt_limb2(c, m); +#else + tab[2 * i] = int_to_ntt_limb(c, m); + tab[2 * i + 1] = init_mul_mod_fast2(c, m); +#endif + c = mul_mod_fast2(c, c_mul, m, c_mul_inv); + } + s->ntt_trig[m_idx][inverse][k] = tab; + return tab; +} + +void fft_clear_cache(bf_context_t *s1) +{ + int m_idx, inverse, k; + BFNTTState *s = s1->ntt_state; + if (s) { + for(m_idx = 0; m_idx < NB_MODS; m_idx++) { + for(inverse = 0; inverse < 2; inverse++) { + for(k = 0; k < NTT_TRIG_K_MAX + 1; k++) { + if (s->ntt_trig[m_idx][inverse][k]) { + ntt_free(s, s->ntt_trig[m_idx][inverse][k]); + s->ntt_trig[m_idx][inverse][k] = NULL; + } + } + } + } +#if defined(__AVX2__) + bf_aligned_free(s1, s); +#else + bf_free(s1, s); +#endif + s1->ntt_state = NULL; + } +} + +#define STRIP_LEN 16 + +/* dst = buf1, src = buf2 */ +static int ntt_fft_partial(BFNTTState *s, NTTLimb *buf1, + int k1, int k2, limb_t n1, limb_t n2, int inverse, + limb_t m_idx) +{ + limb_t i, j, c_mul, c0, m, m_inv, strip_len, l; + NTTLimb *buf2, *buf3; + + buf2 = NULL; + buf3 = ntt_malloc(s, sizeof(NTTLimb) * n1); + if (!buf3) + goto fail; + if (k2 == 0) { + if (ntt_fft(s, buf1, buf1, buf3, k1, inverse, m_idx)) + goto fail; + } else { + strip_len = STRIP_LEN; + buf2 = ntt_malloc(s, sizeof(NTTLimb) * n1 * strip_len); + if (!buf2) + goto fail; + m = ntt_mods[m_idx]; + m_inv = s->ntt_mods_div[m_idx]; + c0 = s->ntt_proot_pow[m_idx][inverse][k1 + k2]; + c_mul = 1; + assert((n2 % strip_len) == 0); + for(j = 0; j < n2; j += strip_len) { + for(i = 0; i < n1; i++) { + for(l = 0; l < strip_len; l++) { + buf2[i + l * n1] = buf1[i * n2 + (j + l)]; + } + } + for(l = 0; l < strip_len; l++) { + if (inverse) + mul_trig(buf2 + l * n1, n1, c_mul, m, m_inv); + if (ntt_fft(s, buf2 + l * n1, buf2 + l * n1, buf3, k1, inverse, m_idx)) + goto fail; + if (!inverse) + mul_trig(buf2 + l * n1, n1, c_mul, m, m_inv); + c_mul = mul_mod_fast(c_mul, c0, m, m_inv); + } + + for(i = 0; i < n1; i++) { + for(l = 0; l < strip_len; l++) { + buf1[i * n2 + (j + l)] = buf2[i + l *n1]; + } + } + } + ntt_free(s, buf2); + } + ntt_free(s, buf3); + return 0; + fail: + ntt_free(s, buf2); + ntt_free(s, buf3); + return -1; +} + + +/* dst = buf1, src = buf2, tmp = buf3 */ +static int ntt_conv(BFNTTState *s, NTTLimb *buf1, NTTLimb *buf2, + int k, int k_tot, limb_t m_idx) +{ + limb_t n1, n2, i; + int k1, k2; + + if (k <= NTT_TRIG_K_MAX) { + k1 = k; + } else { + /* recursive split of the FFT */ + k1 = bf_min(k / 2, NTT_TRIG_K_MAX); + } + k2 = k - k1; + n1 = (limb_t)1 << k1; + n2 = (limb_t)1 << k2; + + if (ntt_fft_partial(s, buf1, k1, k2, n1, n2, 0, m_idx)) + return -1; + if (ntt_fft_partial(s, buf2, k1, k2, n1, n2, 0, m_idx)) + return -1; + if (k2 == 0) { + ntt_vec_mul(s, buf1, buf2, k, k_tot, m_idx); + } else { + for(i = 0; i < n1; i++) { + ntt_conv(s, buf1 + i * n2, buf2 + i * n2, k2, k_tot, m_idx); + } + } + if (ntt_fft_partial(s, buf1, k1, k2, n1, n2, 1, m_idx)) + return -1; + return 0; +} + + +static no_inline void limb_to_ntt(BFNTTState *s, + NTTLimb *tabr, limb_t fft_len, + const limb_t *taba, limb_t a_len, int dpl, + int first_m_idx, int nb_mods) +{ + slimb_t i, n; + dlimb_t a, b; + int j, shift; + limb_t base_mask1, a0, a1, a2, r, m, m_inv; + +#if 0 + for(i = 0; i < a_len; i++) { + printf("%" PRId64 ": " FMT_LIMB "\n", + (int64_t)i, taba[i]); + } +#endif + memset(tabr, 0, sizeof(NTTLimb) * fft_len * nb_mods); + shift = dpl & (LIMB_BITS - 1); + if (shift == 0) + base_mask1 = -1; + else + base_mask1 = ((limb_t)1 << shift) - 1; + n = bf_min(fft_len, (a_len * LIMB_BITS + dpl - 1) / dpl); + for(i = 0; i < n; i++) { + a0 = get_bits(taba, a_len, i * dpl); + if (dpl <= LIMB_BITS) { + a0 &= base_mask1; + a = a0; + } else { + a1 = get_bits(taba, a_len, i * dpl + LIMB_BITS); + if (dpl <= (LIMB_BITS + NTT_MOD_LOG2_MIN)) { + a = a0 | ((dlimb_t)(a1 & base_mask1) << LIMB_BITS); + } else { + if (dpl > 2 * LIMB_BITS) { + a2 = get_bits(taba, a_len, i * dpl + LIMB_BITS * 2) & + base_mask1; + } else { + a1 &= base_mask1; + a2 = 0; + } + // printf("a=0x%016lx%016lx%016lx\n", a2, a1, a0); + a = (a0 >> (LIMB_BITS - NTT_MOD_LOG2_MAX + NTT_MOD_LOG2_MIN)) | + ((dlimb_t)a1 << (NTT_MOD_LOG2_MAX - NTT_MOD_LOG2_MIN)) | + ((dlimb_t)a2 << (LIMB_BITS + NTT_MOD_LOG2_MAX - NTT_MOD_LOG2_MIN)); + a0 &= ((limb_t)1 << (LIMB_BITS - NTT_MOD_LOG2_MAX + NTT_MOD_LOG2_MIN)) - 1; + } + } + for(j = 0; j < nb_mods; j++) { + m = ntt_mods[first_m_idx + j]; + m_inv = s->ntt_mods_div[first_m_idx + j]; + r = mod_fast(a, m, m_inv); + if (dpl > (LIMB_BITS + NTT_MOD_LOG2_MIN)) { + b = ((dlimb_t)r << (LIMB_BITS - NTT_MOD_LOG2_MAX + NTT_MOD_LOG2_MIN)) | a0; + r = mod_fast(b, m, m_inv); + } + tabr[i + j * fft_len] = int_to_ntt_limb(r, m); + } + } +} + +#if defined(__AVX2__) + +#define VEC_LEN 4 + +typedef union { + __m256d v; + double d[4]; +} VecUnion; + +static no_inline void ntt_to_limb(BFNTTState *s, limb_t *tabr, limb_t r_len, + const NTTLimb *buf, int fft_len_log2, int dpl, + int nb_mods) +{ + const limb_t *mods = ntt_mods + NB_MODS - nb_mods; + const __m256d *mods_cr_vec, *mf, *m_inv; + VecUnion y[NB_MODS]; + limb_t u[NB_MODS], carry[NB_MODS], fft_len, base_mask1, r; + slimb_t i, len, pos; + int j, k, l, shift, n_limb1, p; + dlimb_t t; + + j = NB_MODS * (NB_MODS - 1) / 2 - nb_mods * (nb_mods - 1) / 2; + mods_cr_vec = s->ntt_mods_cr_vec + j; + mf = s->ntt_mods_vec + NB_MODS - nb_mods; + m_inv = s->ntt_mods_inv_vec + NB_MODS - nb_mods; + + shift = dpl & (LIMB_BITS - 1); + if (shift == 0) + base_mask1 = -1; + else + base_mask1 = ((limb_t)1 << shift) - 1; + n_limb1 = ((unsigned)dpl - 1) / LIMB_BITS; + for(j = 0; j < NB_MODS; j++) + carry[j] = 0; + for(j = 0; j < NB_MODS; j++) + u[j] = 0; /* avoid warnings */ + memset(tabr, 0, sizeof(limb_t) * r_len); + fft_len = (limb_t)1 << fft_len_log2; + len = bf_min(fft_len, (r_len * LIMB_BITS + dpl - 1) / dpl); + len = (len + VEC_LEN - 1) & ~(VEC_LEN - 1); + i = 0; + while (i < len) { + for(j = 0; j < nb_mods; j++) + y[j].v = *(__m256d *)&buf[i + fft_len * j]; + + /* Chinese remainder to get mixed radix representation */ + l = 0; + for(j = 0; j < nb_mods - 1; j++) { + y[j].v = ntt_mod1(y[j].v, mf[j]); + for(k = j + 1; k < nb_mods; k++) { + y[k].v = ntt_mul_mod(y[k].v - y[j].v, + mods_cr_vec[l], mf[k], m_inv[k]); + l++; + } + } + y[j].v = ntt_mod1(y[j].v, mf[j]); + + for(p = 0; p < VEC_LEN; p++) { + /* back to normal representation */ + u[0] = (int64_t)y[nb_mods - 1].d[p]; + l = 1; + for(j = nb_mods - 2; j >= 1; j--) { + r = (int64_t)y[j].d[p]; + for(k = 0; k < l; k++) { + t = (dlimb_t)u[k] * mods[j] + r; + r = t >> LIMB_BITS; + u[k] = t; + } + u[l] = r; + l++; + } + /* XXX: for nb_mods = 5, l should be 4 */ + + /* last step adds the carry */ + r = (int64_t)y[0].d[p]; + for(k = 0; k < l; k++) { + t = (dlimb_t)u[k] * mods[j] + r + carry[k]; + r = t >> LIMB_BITS; + u[k] = t; + } + u[l] = r + carry[l]; + +#if 0 + printf("%" PRId64 ": ", i); + for(j = nb_mods - 1; j >= 0; j--) { + printf(" %019" PRIu64, u[j]); + } + printf("\n"); +#endif + + /* write the digits */ + pos = i * dpl; + for(j = 0; j < n_limb1; j++) { + put_bits(tabr, r_len, pos, u[j]); + pos += LIMB_BITS; + } + put_bits(tabr, r_len, pos, u[n_limb1] & base_mask1); + /* shift by dpl digits and set the carry */ + if (shift == 0) { + for(j = n_limb1 + 1; j < nb_mods; j++) + carry[j - (n_limb1 + 1)] = u[j]; + } else { + for(j = n_limb1; j < nb_mods - 1; j++) { + carry[j - n_limb1] = (u[j] >> shift) | + (u[j + 1] << (LIMB_BITS - shift)); + } + carry[nb_mods - 1 - n_limb1] = u[nb_mods - 1] >> shift; + } + i++; + } + } +} +#else +static no_inline void ntt_to_limb(BFNTTState *s, limb_t *tabr, limb_t r_len, + const NTTLimb *buf, int fft_len_log2, int dpl, + int nb_mods) +{ + const limb_t *mods = ntt_mods + NB_MODS - nb_mods; + const limb_t *mods_cr, *mods_cr_inv; + limb_t y[NB_MODS], u[NB_MODS], carry[NB_MODS], fft_len, base_mask1, r; + slimb_t i, len, pos; + int j, k, l, shift, n_limb1; + dlimb_t t; + + j = NB_MODS * (NB_MODS - 1) / 2 - nb_mods * (nb_mods - 1) / 2; + mods_cr = ntt_mods_cr + j; + mods_cr_inv = s->ntt_mods_cr_inv + j; + + shift = dpl & (LIMB_BITS - 1); + if (shift == 0) + base_mask1 = -1; + else + base_mask1 = ((limb_t)1 << shift) - 1; + n_limb1 = ((unsigned)dpl - 1) / LIMB_BITS; + for(j = 0; j < NB_MODS; j++) + carry[j] = 0; + for(j = 0; j < NB_MODS; j++) + u[j] = 0; /* avoid warnings */ + memset(tabr, 0, sizeof(limb_t) * r_len); + fft_len = (limb_t)1 << fft_len_log2; + len = bf_min(fft_len, (r_len * LIMB_BITS + dpl - 1) / dpl); + for(i = 0; i < len; i++) { + for(j = 0; j < nb_mods; j++) { + y[j] = ntt_limb_to_int(buf[i + fft_len * j], mods[j]); + } + + /* Chinese remainder to get mixed radix representation */ + l = 0; + for(j = 0; j < nb_mods - 1; j++) { + for(k = j + 1; k < nb_mods; k++) { + limb_t m; + m = mods[k]; + /* Note: there is no overflow in the sub_mod() because + the modulos are sorted by increasing order */ + y[k] = mul_mod_fast2(y[k] - y[j] + m, + mods_cr[l], m, mods_cr_inv[l]); + l++; + } + } + + /* back to normal representation */ + u[0] = y[nb_mods - 1]; + l = 1; + for(j = nb_mods - 2; j >= 1; j--) { + r = y[j]; + for(k = 0; k < l; k++) { + t = (dlimb_t)u[k] * mods[j] + r; + r = t >> LIMB_BITS; + u[k] = t; + } + u[l] = r; + l++; + } + + /* last step adds the carry */ + r = y[0]; + for(k = 0; k < l; k++) { + t = (dlimb_t)u[k] * mods[j] + r + carry[k]; + r = t >> LIMB_BITS; + u[k] = t; + } + u[l] = r + carry[l]; + +#if 0 + printf("%" PRId64 ": ", (int64_t)i); + for(j = nb_mods - 1; j >= 0; j--) { + printf(" " FMT_LIMB, u[j]); + } + printf("\n"); +#endif + + /* write the digits */ + pos = i * dpl; + for(j = 0; j < n_limb1; j++) { + put_bits(tabr, r_len, pos, u[j]); + pos += LIMB_BITS; + } + put_bits(tabr, r_len, pos, u[n_limb1] & base_mask1); + /* shift by dpl digits and set the carry */ + if (shift == 0) { + for(j = n_limb1 + 1; j < nb_mods; j++) + carry[j - (n_limb1 + 1)] = u[j]; + } else { + for(j = n_limb1; j < nb_mods - 1; j++) { + carry[j - n_limb1] = (u[j] >> shift) | + (u[j + 1] << (LIMB_BITS - shift)); + } + carry[nb_mods - 1 - n_limb1] = u[nb_mods - 1] >> shift; + } + } +} +#endif + +static int ntt_static_init(bf_context_t *s1) +{ + BFNTTState *s; + int inverse, i, j, k, l; + limb_t c, c_inv, c_inv2, m, m_inv; + + if (s1->ntt_state) + return 0; +#if defined(__AVX2__) + s = bf_aligned_malloc(s1, sizeof(*s), 64); +#else + s = bf_malloc(s1, sizeof(*s)); +#endif + if (!s) + return -1; + memset(s, 0, sizeof(*s)); + s1->ntt_state = s; + s->ctx = s1; + + for(j = 0; j < NB_MODS; j++) { + m = ntt_mods[j]; + m_inv = init_mul_mod_fast(m); + s->ntt_mods_div[j] = m_inv; +#if defined(__AVX2__) + s->ntt_mods_vec[j] = _mm256_set1_pd(m); + s->ntt_mods_inv_vec[j] = _mm256_set1_pd(1.0 / (double)m); +#endif + c_inv2 = (m + 1) / 2; /* 1/2 */ + c_inv = 1; + for(i = 0; i <= NTT_PROOT_2EXP; i++) { + s->ntt_len_inv[j][i][0] = c_inv; + s->ntt_len_inv[j][i][1] = init_mul_mod_fast2(c_inv, m); + c_inv = mul_mod_fast(c_inv, c_inv2, m, m_inv); + } + + for(inverse = 0; inverse < 2; inverse++) { + c = ntt_proot[inverse][j]; + for(i = 0; i < NTT_PROOT_2EXP; i++) { + s->ntt_proot_pow[j][inverse][NTT_PROOT_2EXP - i] = c; + s->ntt_proot_pow_inv[j][inverse][NTT_PROOT_2EXP - i] = + init_mul_mod_fast2(c, m); + c = mul_mod_fast(c, c, m, m_inv); + } + } + } + + l = 0; + for(j = 0; j < NB_MODS - 1; j++) { + for(k = j + 1; k < NB_MODS; k++) { +#if defined(__AVX2__) + s->ntt_mods_cr_vec[l] = _mm256_set1_pd(int_to_ntt_limb2(ntt_mods_cr[l], + ntt_mods[k])); +#else + s->ntt_mods_cr_inv[l] = init_mul_mod_fast2(ntt_mods_cr[l], + ntt_mods[k]); +#endif + l++; + } + } + return 0; +} + +int bf_get_fft_size(int *pdpl, int *pnb_mods, limb_t len) +{ + int dpl, fft_len_log2, n_bits, nb_mods, dpl_found, fft_len_log2_found; + int int_bits, nb_mods_found; + limb_t cost, min_cost; + + min_cost = -1; + dpl_found = 0; + nb_mods_found = 4; + fft_len_log2_found = 0; + for(nb_mods = 3; nb_mods <= NB_MODS; nb_mods++) { + int_bits = ntt_int_bits[NB_MODS - nb_mods]; + dpl = bf_min((int_bits - 4) / 2, + 2 * LIMB_BITS + 2 * NTT_MOD_LOG2_MIN - NTT_MOD_LOG2_MAX); + for(;;) { + fft_len_log2 = ceil_log2((len * LIMB_BITS + dpl - 1) / dpl); + if (fft_len_log2 > NTT_PROOT_2EXP) + goto next; + n_bits = fft_len_log2 + 2 * dpl; + if (n_bits <= int_bits) { + cost = ((limb_t)(fft_len_log2 + 1) << fft_len_log2) * nb_mods; + // printf("n=%d dpl=%d: cost=%" PRId64 "\n", nb_mods, dpl, (int64_t)cost); + if (cost < min_cost) { + min_cost = cost; + dpl_found = dpl; + nb_mods_found = nb_mods; + fft_len_log2_found = fft_len_log2; + } + break; + } + dpl--; + if (dpl == 0) + break; + } + next: ; + } + if (!dpl_found) + abort(); + /* limit dpl if possible to reduce fixed cost of limb/NTT conversion */ + if (dpl_found > (LIMB_BITS + NTT_MOD_LOG2_MIN) && + ((limb_t)(LIMB_BITS + NTT_MOD_LOG2_MIN) << fft_len_log2_found) >= + len * LIMB_BITS) { + dpl_found = LIMB_BITS + NTT_MOD_LOG2_MIN; + } + *pnb_mods = nb_mods_found; + *pdpl = dpl_found; + return fft_len_log2_found; +} + +/* return 0 if OK, -1 if memory error */ +static no_inline int fft_mul(bf_context_t *s1, + bf_t *res, limb_t *a_tab, limb_t a_len, + limb_t *b_tab, limb_t b_len, int mul_flags) +{ + BFNTTState *s; + int dpl, fft_len_log2, j, nb_mods, reduced_mem; + slimb_t len, fft_len; + NTTLimb *buf1, *buf2, *ptr; +#if defined(USE_MUL_CHECK) + limb_t ha, hb, hr, h_ref; +#endif + + if (ntt_static_init(s1)) + return -1; + s = s1->ntt_state; + + /* find the optimal number of digits per limb (dpl) */ + len = a_len + b_len; + fft_len_log2 = bf_get_fft_size(&dpl, &nb_mods, len); + fft_len = (uint64_t)1 << fft_len_log2; + // printf("len=%" PRId64 " fft_len_log2=%d dpl=%d\n", len, fft_len_log2, dpl); +#if defined(USE_MUL_CHECK) + ha = mp_mod1(a_tab, a_len, BF_CHKSUM_MOD, 0); + hb = mp_mod1(b_tab, b_len, BF_CHKSUM_MOD, 0); +#endif + if ((mul_flags & (FFT_MUL_R_OVERLAP_A | FFT_MUL_R_OVERLAP_B)) == 0) { + if (!(mul_flags & FFT_MUL_R_NORESIZE)) + bf_resize(res, 0); + } else if (mul_flags & FFT_MUL_R_OVERLAP_B) { + limb_t *tmp_tab, tmp_len; + /* it is better to free 'b' first */ + tmp_tab = a_tab; + a_tab = b_tab; + b_tab = tmp_tab; + tmp_len = a_len; + a_len = b_len; + b_len = tmp_len; + } + buf1 = ntt_malloc(s, sizeof(NTTLimb) * fft_len * nb_mods); + if (!buf1) + return -1; + limb_to_ntt(s, buf1, fft_len, a_tab, a_len, dpl, + NB_MODS - nb_mods, nb_mods); + if ((mul_flags & (FFT_MUL_R_OVERLAP_A | FFT_MUL_R_OVERLAP_B)) == + FFT_MUL_R_OVERLAP_A) { + if (!(mul_flags & FFT_MUL_R_NORESIZE)) + bf_resize(res, 0); + } + reduced_mem = (fft_len_log2 >= 14); + if (!reduced_mem) { + buf2 = ntt_malloc(s, sizeof(NTTLimb) * fft_len * nb_mods); + if (!buf2) + goto fail; + limb_to_ntt(s, buf2, fft_len, b_tab, b_len, dpl, + NB_MODS - nb_mods, nb_mods); + if (!(mul_flags & FFT_MUL_R_NORESIZE)) + bf_resize(res, 0); /* in case res == b */ + } else { + buf2 = ntt_malloc(s, sizeof(NTTLimb) * fft_len); + if (!buf2) + goto fail; + } + for(j = 0; j < nb_mods; j++) { + if (reduced_mem) { + limb_to_ntt(s, buf2, fft_len, b_tab, b_len, dpl, + NB_MODS - nb_mods + j, 1); + ptr = buf2; + } else { + ptr = buf2 + fft_len * j; + } + if (ntt_conv(s, buf1 + fft_len * j, ptr, + fft_len_log2, fft_len_log2, j + NB_MODS - nb_mods)) + goto fail; + } + if (!(mul_flags & FFT_MUL_R_NORESIZE)) + bf_resize(res, 0); /* in case res == b and reduced mem */ + ntt_free(s, buf2); + buf2 = NULL; + if (!(mul_flags & FFT_MUL_R_NORESIZE)) { + if (bf_resize(res, len)) + goto fail; + } + ntt_to_limb(s, res->tab, len, buf1, fft_len_log2, dpl, nb_mods); + ntt_free(s, buf1); +#if defined(USE_MUL_CHECK) + hr = mp_mod1(res->tab, len, BF_CHKSUM_MOD, 0); + h_ref = mul_mod(ha, hb, BF_CHKSUM_MOD); + if (hr != h_ref) { + printf("ntt_mul_error: len=%" PRId_LIMB " fft_len_log2=%d dpl=%d nb_mods=%d\n", + len, fft_len_log2, dpl, nb_mods); + // printf("ha=0x" FMT_LIMB" hb=0x" FMT_LIMB " hr=0x" FMT_LIMB " expected=0x" FMT_LIMB "\n", ha, hb, hr, h_ref); + exit(1); + } +#endif + return 0; + fail: + ntt_free(s, buf1); + ntt_free(s, buf2); + return -1; +} + +#else /* USE_FFT_MUL */ + +int bf_get_fft_size(int *pdpl, int *pnb_mods, limb_t len) +{ + return 0; +} + +#endif /* !USE_FFT_MUL */ |