about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorbptato <nincsnevem662@gmail.com>2025-05-04 17:00:49 +0200
committerbptato <nincsnevem662@gmail.com>2025-05-04 17:06:47 +0200
commitbee4f12b0348e8893d62a01ce027b1550bb6ef09 (patch)
treecccddd33c62ed0e8a0487944648331aa06a95a93
parent9b184b31eb916b013a4d501a5d1e9ff8460d3fee (diff)
downloadchawan-bee4f12b0348e8893d62a01ce027b1550bb6ef09.tar.gz
Update QuickJS-NG to 0.10.0
-rw-r--r--lib/monoucha0/monoucha/libregexp.nim5
-rw-r--r--lib/monoucha0/monoucha/qjs/builtin-array-fromasync.h113
-rw-r--r--lib/monoucha0/monoucha/qjs/cutils.c182
-rw-r--r--lib/monoucha0/monoucha/qjs/cutils.h55
-rw-r--r--lib/monoucha0/monoucha/qjs/libbf.c8422
-rw-r--r--lib/monoucha0/monoucha/qjs/libbf.h545
-rw-r--r--lib/monoucha0/monoucha/qjs/libregexp.c36
-rw-r--r--lib/monoucha0/monoucha/qjs/libregexp.h5
-rw-r--r--lib/monoucha0/monoucha/qjs/quickjs-atom.h1
-rw-r--r--lib/monoucha0/monoucha/qjs/quickjs-opcode.h1
-rw-r--r--lib/monoucha0/monoucha/qjs/quickjs.c3338
-rw-r--r--lib/monoucha0/monoucha/qjs/quickjs.h57
-rw-r--r--lib/monoucha0/monoucha/qjs/xsum.c1122
-rw-r--r--lib/monoucha0/monoucha/qjs/xsum.h133
-rw-r--r--lib/monoucha0/monoucha/quickjs.nim2
-rw-r--r--res/license.md28
16 files changed, 4245 insertions, 9800 deletions
diff --git a/lib/monoucha0/monoucha/libregexp.nim b/lib/monoucha0/monoucha/libregexp.nim
index a3d0506f..7810f418 100644
--- a/lib/monoucha0/monoucha/libregexp.nim
+++ b/lib/monoucha0/monoucha/libregexp.nim
@@ -28,6 +28,11 @@ proc lre_realloc(opaque, p: pointer; size: csize_t): pointer {.exportc.} =
 # compilation pass" (i.e. in C).
 {.emit: """
 #ifndef NOT_LRE_ONLY
+int lre_check_timeout(void *opaque)
+{
+  return 0;
+}
+
 bool lre_check_stack_overflow(void *opaque, size_t alloca_size)
 {
   return 0;
diff --git a/lib/monoucha0/monoucha/qjs/builtin-array-fromasync.h b/lib/monoucha0/monoucha/qjs/builtin-array-fromasync.h
new file mode 100644
index 00000000..baaa8687
--- /dev/null
+++ b/lib/monoucha0/monoucha/qjs/builtin-array-fromasync.h
@@ -0,0 +1,113 @@
+/* File generated automatically by the QuickJS-ng compiler. */
+
+#include <inttypes.h>
+
+const uint32_t qjsc_builtin_array_fromasync_size = 826;
+
+const uint8_t qjsc_builtin_array_fromasync[826] = {
+ 0x14, 0x0d, 0x01, 0x1a, 0x61, 0x73, 0x79, 0x6e,
+ 0x63, 0x49, 0x74, 0x65, 0x72, 0x61, 0x74, 0x6f,
+ 0x72, 0x01, 0x10, 0x69, 0x74, 0x65, 0x72, 0x61,
+ 0x74, 0x6f, 0x72, 0x01, 0x12, 0x61, 0x72, 0x72,
+ 0x61, 0x79, 0x4c, 0x69, 0x6b, 0x65, 0x01, 0x0a,
+ 0x6d, 0x61, 0x70, 0x46, 0x6e, 0x01, 0x0e, 0x74,
+ 0x68, 0x69, 0x73, 0x41, 0x72, 0x67, 0x01, 0x0c,
+ 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x01, 0x02,
+ 0x69, 0x01, 0x1a, 0x69, 0x73, 0x43, 0x6f, 0x6e,
+ 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72,
+ 0x01, 0x08, 0x73, 0x79, 0x6e, 0x63, 0x01, 0x0c,
+ 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x01, 0x08,
+ 0x69, 0x74, 0x65, 0x72, 0x01, 0x1c, 0x6e, 0x6f,
+ 0x74, 0x20, 0x61, 0x20, 0x66, 0x75, 0x6e, 0x63,
+ 0x74, 0x69, 0x6f, 0x6e, 0x01, 0x08, 0x63, 0x61,
+ 0x6c, 0x6c, 0x0c, 0x00, 0x02, 0x00, 0xa2, 0x01,
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x04, 0x01,
+ 0xa4, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x43, 0x02,
+ 0x01, 0x00, 0x05, 0x00, 0x05, 0x01, 0x00, 0x01,
+ 0x03, 0x05, 0xaa, 0x02, 0x00, 0x01, 0x40, 0x9e,
+ 0x03, 0x00, 0x01, 0x40, 0xc2, 0x03, 0x00, 0x01,
+ 0x40, 0xcc, 0x01, 0x00, 0x01, 0x40, 0xc4, 0x03,
+ 0x00, 0x01, 0x40, 0x0c, 0x60, 0x02, 0x01, 0xf8,
+ 0x01, 0x03, 0x0e, 0x01, 0x06, 0x05, 0x00, 0x86,
+ 0x04, 0x11, 0xc6, 0x03, 0x00, 0x01, 0x00, 0xc8,
+ 0x03, 0x00, 0x01, 0x00, 0xca, 0x03, 0x00, 0x01,
+ 0x00, 0xc6, 0x03, 0x01, 0xff, 0xff, 0xff, 0xff,
+ 0x0f, 0x20, 0xc8, 0x03, 0x01, 0x01, 0x20, 0xca,
+ 0x03, 0x01, 0x02, 0x20, 0xcc, 0x03, 0x02, 0x00,
+ 0x20, 0xce, 0x03, 0x02, 0x04, 0x20, 0xd0, 0x03,
+ 0x02, 0x05, 0x20, 0xd2, 0x03, 0x02, 0x06, 0x20,
+ 0xd4, 0x03, 0x02, 0x07, 0x20, 0x64, 0x06, 0x08,
+ 0x20, 0x82, 0x01, 0x07, 0x09, 0x20, 0xd6, 0x03,
+ 0x0a, 0x08, 0x30, 0x82, 0x01, 0x0d, 0x0b, 0x20,
+ 0xd4, 0x01, 0x0d, 0x0c, 0x20, 0x10, 0x00, 0x01,
+ 0x00, 0x9e, 0x03, 0x01, 0x03, 0xc2, 0x03, 0x02,
+ 0x03, 0xc4, 0x03, 0x04, 0x03, 0xaa, 0x02, 0x00,
+ 0x03, 0xcc, 0x01, 0x03, 0x03, 0x08, 0xc4, 0x0d,
+ 0x62, 0x02, 0x00, 0x62, 0x01, 0x00, 0x62, 0x00,
+ 0x00, 0xd3, 0xcb, 0xd4, 0x11, 0xf4, 0xec, 0x08,
+ 0x0e, 0x39, 0x46, 0x00, 0x00, 0x00, 0xdc, 0xcc,
+ 0xd5, 0x11, 0xf4, 0xec, 0x08, 0x0e, 0x39, 0x46,
+ 0x00, 0x00, 0x00, 0xdd, 0xcd, 0x62, 0x07, 0x00,
+ 0x62, 0x06, 0x00, 0x62, 0x05, 0x00, 0x62, 0x04,
+ 0x00, 0x62, 0x03, 0x00, 0xd4, 0x39, 0x46, 0x00,
+ 0x00, 0x00, 0xb0, 0xec, 0x16, 0xd4, 0x98, 0x04,
+ 0x1b, 0x00, 0x00, 0x00, 0xb0, 0xec, 0x0c, 0xdf,
+ 0x11, 0x04, 0xec, 0x00, 0x00, 0x00, 0x21, 0x01,
+ 0x00, 0x30, 0x06, 0xce, 0xb6, 0xc4, 0x04, 0xc3,
+ 0x0d, 0xf7, 0xc4, 0x05, 0x09, 0xc4, 0x06, 0xd3,
+ 0xe0, 0x48, 0xc4, 0x07, 0x63, 0x07, 0x00, 0x07,
+ 0xad, 0xec, 0x0f, 0x0a, 0x11, 0x64, 0x06, 0x00,
+ 0x0e, 0xd3, 0xe1, 0x48, 0x11, 0x64, 0x07, 0x00,
+ 0x0e, 0x63, 0x07, 0x00, 0x07, 0xad, 0x6a, 0xa6,
+ 0x00, 0x00, 0x00, 0x62, 0x08, 0x00, 0x06, 0x11,
+ 0xf4, 0xed, 0x0c, 0x71, 0x43, 0x32, 0x00, 0x00,
+ 0x00, 0xc4, 0x08, 0x0e, 0xee, 0x05, 0x0e, 0xd3,
+ 0xee, 0xf2, 0x63, 0x08, 0x00, 0x8e, 0x11, 0xed,
+ 0x03, 0x0e, 0xb6, 0x11, 0x64, 0x08, 0x00, 0x0e,
+ 0x63, 0x05, 0x00, 0xec, 0x0c, 0xc3, 0x0d, 0x11,
+ 0x63, 0x08, 0x00, 0x21, 0x01, 0x00, 0xee, 0x06,
+ 0xe2, 0x63, 0x08, 0x00, 0xf1, 0x11, 0x64, 0x03,
+ 0x00, 0x0e, 0x63, 0x04, 0x00, 0x63, 0x08, 0x00,
+ 0xa7, 0x6a, 0x2a, 0x01, 0x00, 0x00, 0x62, 0x09,
+ 0x00, 0xd3, 0x63, 0x04, 0x00, 0x48, 0xc4, 0x09,
+ 0x63, 0x06, 0x00, 0xec, 0x0a, 0x63, 0x09, 0x00,
+ 0x8c, 0x11, 0x64, 0x09, 0x00, 0x0e, 0xd4, 0xec,
+ 0x17, 0xd4, 0x43, 0xed, 0x00, 0x00, 0x00, 0xd5,
+ 0x63, 0x09, 0x00, 0x63, 0x04, 0x00, 0x24, 0x03,
+ 0x00, 0x8c, 0x11, 0x64, 0x09, 0x00, 0x0e, 0x5f,
+ 0x04, 0x00, 0x63, 0x03, 0x00, 0x63, 0x04, 0x00,
+ 0x92, 0x64, 0x04, 0x00, 0x0b, 0x63, 0x09, 0x00,
+ 0x4d, 0x41, 0x00, 0x00, 0x00, 0x0a, 0x4d, 0x3e,
+ 0x00, 0x00, 0x00, 0x0a, 0x4d, 0x3f, 0x00, 0x00,
+ 0x00, 0xf3, 0x0e, 0xee, 0x9e, 0x62, 0x0a, 0x00,
+ 0x63, 0x07, 0x00, 0x43, 0xed, 0x00, 0x00, 0x00,
+ 0xd3, 0x24, 0x01, 0x00, 0xc4, 0x0a, 0x63, 0x05,
+ 0x00, 0xec, 0x09, 0xc3, 0x0d, 0x11, 0x21, 0x00,
+ 0x00, 0xee, 0x03, 0xe2, 0xf0, 0x11, 0x64, 0x03,
+ 0x00, 0x0e, 0x6d, 0x8c, 0x00, 0x00, 0x00, 0x62,
+ 0x0c, 0x00, 0x62, 0x0b, 0x00, 0x06, 0x11, 0xf4,
+ 0xed, 0x13, 0x71, 0x43, 0x41, 0x00, 0x00, 0x00,
+ 0xc4, 0x0b, 0x43, 0x6a, 0x00, 0x00, 0x00, 0xc4,
+ 0x0c, 0x0e, 0xee, 0x10, 0x0e, 0x63, 0x0a, 0x00,
+ 0x43, 0x6b, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00,
+ 0x8c, 0xee, 0xe0, 0x63, 0x0c, 0x00, 0xed, 0x4e,
+ 0x63, 0x06, 0x00, 0xec, 0x0a, 0x63, 0x0b, 0x00,
+ 0x8c, 0x11, 0x64, 0x0b, 0x00, 0x0e, 0xd4, 0xec,
+ 0x17, 0xd4, 0x43, 0xed, 0x00, 0x00, 0x00, 0xd5,
+ 0x63, 0x0b, 0x00, 0x63, 0x04, 0x00, 0x24, 0x03,
+ 0x00, 0x8c, 0x11, 0x64, 0x0b, 0x00, 0x0e, 0x5f,
+ 0x04, 0x00, 0x63, 0x03, 0x00, 0x63, 0x04, 0x00,
+ 0x92, 0x64, 0x04, 0x00, 0x0b, 0x63, 0x0b, 0x00,
+ 0x4d, 0x41, 0x00, 0x00, 0x00, 0x0a, 0x4d, 0x3e,
+ 0x00, 0x00, 0x00, 0x0a, 0x4d, 0x3f, 0x00, 0x00,
+ 0x00, 0xf3, 0x0e, 0xee, 0x83, 0x0e, 0x06, 0x6e,
+ 0x0d, 0x00, 0x00, 0x00, 0x0e, 0xee, 0x1e, 0x6e,
+ 0x05, 0x00, 0x00, 0x00, 0x30, 0x63, 0x0a, 0x00,
+ 0x42, 0x06, 0x00, 0x00, 0x00, 0xec, 0x0d, 0x63,
+ 0x0a, 0x00, 0x43, 0x06, 0x00, 0x00, 0x00, 0x24,
+ 0x00, 0x00, 0x0e, 0x6f, 0x63, 0x03, 0x00, 0x63,
+ 0x04, 0x00, 0x44, 0x32, 0x00, 0x00, 0x00, 0x63,
+ 0x03, 0x00, 0x2f, 0xc1, 0x00, 0x28, 0xc1, 0x00,
+ 0xcf, 0x28,
+};
+
diff --git a/lib/monoucha0/monoucha/qjs/cutils.c b/lib/monoucha0/monoucha/qjs/cutils.c
index a7b80502..5ad14eba 100644
--- a/lib/monoucha0/monoucha/qjs/cutils.c
+++ b/lib/monoucha0/monoucha/qjs/cutils.c
@@ -31,6 +31,13 @@
 #if !defined(_MSC_VER)
 #include <sys/time.h>
 #endif
+#if defined(_WIN32)
+#include <windows.h>
+#include <process.h> // _beginthread
+#endif
+#if defined(__APPLE__)
+#include <mach-o/dyld.h>
+#endif
 
 #include "cutils.h"
 
@@ -1197,10 +1204,112 @@ int64_t js__gettimeofday_us(void) {
     return ((int64_t)tv.tv_sec * 1000000) + tv.tv_usec;
 }
 
-/*--- Cross-platform threading APIs. ----*/
+#if defined(_WIN32)
+int js_exepath(char *buffer, size_t *size_ptr) {
+    int utf8_len, utf16_buffer_len, utf16_len;
+    WCHAR* utf16_buffer;
+
+    if (buffer == NULL || size_ptr == NULL || *size_ptr == 0)
+      return -1;
+
+    if (*size_ptr > 32768) {
+      /* Windows paths can never be longer than this. */
+      utf16_buffer_len = 32768;
+    } else {
+      utf16_buffer_len = (int)*size_ptr;
+    }
+
+    utf16_buffer = malloc(sizeof(WCHAR) * utf16_buffer_len);
+    if (!utf16_buffer)
+        return -1;
+
+    /* Get the path as UTF-16. */
+    utf16_len = GetModuleFileNameW(NULL, utf16_buffer, utf16_buffer_len);
+    if (utf16_len <= 0)
+      goto error;
+
+    /* Convert to UTF-8 */
+    utf8_len = WideCharToMultiByte(CP_UTF8,
+                                   0,
+                                   utf16_buffer,
+                                   -1,
+                                   buffer,
+                                   (int)*size_ptr,
+                                   NULL,
+                                   NULL);
+    if (utf8_len == 0)
+      goto error;
+
+    free(utf16_buffer);
+
+    /* utf8_len *does* include the terminating null at this point, but the
+     * returned size shouldn't. */
+    *size_ptr = utf8_len - 1;
+    return 0;
+
+error:
+    free(utf16_buffer);
+    return -1;
+}
+#elif defined(__APPLE__)
+int js_exepath(char *buffer, size_t *size) {
+    /* realpath(exepath) may be > PATH_MAX so double it to be on the safe side. */
+    char abspath[PATH_MAX * 2 + 1];
+    char exepath[PATH_MAX + 1];
+    uint32_t exepath_size;
+    size_t abspath_size;
+
+    if (buffer == NULL || size == NULL || *size == 0)
+        return -1;
+
+    exepath_size = sizeof(exepath);
+    if (_NSGetExecutablePath(exepath, &exepath_size))
+        return -1;
+
+    if (realpath(exepath, abspath) != abspath)
+        return -1;
+
+    abspath_size = strlen(abspath);
+    if (abspath_size == 0)
+        return -1;
+
+    *size -= 1;
+    if (*size > abspath_size)
+        *size = abspath_size;
+
+    memcpy(buffer, abspath, *size);
+    buffer[*size] = '\0';
+
+    return 0;
+}
+#elif defined(__linux__)
+int js_exepath(char *buffer, size_t *size) {
+    ssize_t n;
+
+    if (buffer == NULL || size == NULL || *size == 0)
+        return -1;
+
+    n = *size - 1;
+    if (n > 0)
+        n = readlink("/proc/self/exe", buffer, n);
 
-#if !defined(EMSCRIPTEN) && !defined(__wasi__) && !defined(MNC_NO_THREADS)
+    if (n == -1)
+        return n;
 
+    buffer[n] = '\0';
+    *size = n;
+
+    return 0;
+}
+#else
+int js_exepath(char* buffer, size_t* size_ptr) {
+    return -1;
+}
+#endif
+
+/*--- Cross-platform threading APIs. ----*/
+
+#if JS_HAVE_THREADS
 #if defined(_WIN32)
 typedef void (*js__once_cb)(void);
 
@@ -1267,6 +1376,37 @@ int js_cond_timedwait(js_cond_t *cond, js_mutex_t *mutex, uint64_t timeout) {
     return -1;
 }
 
+int js_thread_create(js_thread_t *thrd, void (*start)(void *), void *arg,
+                     int flags)
+{
+    HANDLE h, cp;
+
+    *thrd = INVALID_HANDLE_VALUE;
+    if (flags & ~JS_THREAD_CREATE_DETACHED)
+        return -1;
+    h = (HANDLE)_beginthread(start, /*stacksize*/2<<20, arg);
+    if (!h)
+        return -1;
+    if (flags & JS_THREAD_CREATE_DETACHED)
+        return 0;
+    // _endthread() automatically closes the handle but we want to wait on
+    // it so make a copy. Race-y for very short-lived threads. Can be solved
+    // by switching to _beginthreadex(CREATE_SUSPENDED) but means changing
+    // |start| from __cdecl to __stdcall.
+    cp = GetCurrentProcess();
+    if (DuplicateHandle(cp, h, cp, thrd, 0, FALSE, DUPLICATE_SAME_ACCESS))
+        return 0;
+    return -1;
+}
+
+int js_thread_join(js_thread_t thrd)
+{
+    if (WaitForSingleObject(thrd, INFINITE))
+        return -1;
+    CloseHandle(thrd);
+    return 0;
+}
+
 #else /* !defined(_WIN32) */
 
 void js_once(js_once_t *guard, void (*callback)(void)) {
@@ -1407,9 +1547,43 @@ int js_cond_timedwait(js_cond_t *cond, js_mutex_t *mutex, uint64_t timeout) {
     return -1;
 }
 
-#endif
+int js_thread_create(js_thread_t *thrd, void (*start)(void *), void *arg,
+                     int flags)
+{
+    union {
+        void (*x)(void *);
+        void *(*f)(void *);
+    } u = {start};
+    pthread_attr_t attr;
+    int ret;
+
+    if (flags & ~JS_THREAD_CREATE_DETACHED)
+        return -1;
+    if (pthread_attr_init(&attr))
+        return -1;
+    ret = -1;
+    if (pthread_attr_setstacksize(&attr, 2<<20))
+        goto fail;
+    if (flags & JS_THREAD_CREATE_DETACHED)
+        if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
+            goto fail;
+    if (pthread_create(thrd, &attr, u.f, arg))
+        goto fail;
+    ret = 0;
+fail:
+    pthread_attr_destroy(&attr);
+    return ret;
+}
+
+int js_thread_join(js_thread_t thrd)
+{
+    if (pthread_join(thrd, NULL))
+        return -1;
+    return 0;
+}
 
-#endif /* !defined(EMSCRIPTEN) && !defined(__wasi__) */
+#endif /* !defined(_WIN32) */
+#endif /* JS_HAVE_THREADS */
 
 #ifdef __GNUC__
 #pragma GCC visibility pop
diff --git a/lib/monoucha0/monoucha/qjs/cutils.h b/lib/monoucha0/monoucha/qjs/cutils.h
index c0537e68..e6c7e7b8 100644
--- a/lib/monoucha0/monoucha/qjs/cutils.h
+++ b/lib/monoucha0/monoucha/qjs/cutils.h
@@ -54,6 +54,10 @@ extern "C" {
 #include <errno.h>
 #include <pthread.h>
 #endif
+#if !defined(_WIN32)
+#include <limits.h>
+#include <unistd.h>
+#endif
 
 #if defined(_MSC_VER) && !defined(__clang__)
 #  define likely(x)       (x)
@@ -94,7 +98,7 @@ extern "C" {
 #define container_of(ptr, type, member) ((type *)((uint8_t *)(ptr) - offsetof(type, member)))
 #endif
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__cplusplus)
 #define minimum_length(n) n
 #else
 #define minimum_length(n) static n
@@ -118,6 +122,14 @@ extern "C" {
 #endif
 #endif
 
+#if defined(PATH_MAX)
+# define JS__PATH_MAX PATH_MAX
+#elif defined(_WIN32)
+# define JS__PATH_MAX 32767
+#else
+# define JS__PATH_MAX 8192
+#endif
+
 void js__pstrcpy(char *buf, int buf_size, const char *str);
 char *js__pstrcat(char *buf, int buf_size, const char *s);
 int js__strstart(const char *str, const char *val, const char **ptr);
@@ -545,6 +557,26 @@ void rqsort(void *base, size_t nmemb, size_t size,
             int (*cmp)(const void *, const void *, void *),
             void *arg);
 
+static inline uint64_t float64_as_uint64(double d)
+{
+    union {
+        double d;
+        uint64_t u64;
+    } u;
+    u.d = d;
+    return u.u64;
+}
+
+static inline double uint64_as_float64(uint64_t u64)
+{
+    union {
+        double d;
+        uint64_t u64;
+    } u;
+    u.u64 = u64;
+    return u.d;
+}
+
 int64_t js__gettimeofday_us(void);
 uint64_t js__hrtime_ns(void);
 
@@ -561,20 +593,30 @@ static inline size_t js__malloc_usable_size(const void *ptr)
 #endif
 }
 
+int js_exepath(char* buffer, size_t* size);
+
 /* Cross-platform threading APIs. */
 
-#if !defined(EMSCRIPTEN) && !defined(__wasi__) && !defined(MNC_NO_THREADS)
+#if defined(EMSCRIPTEN) || defined(__wasi__) || defined(MNC_NO_THREADS)
+
+#define JS_HAVE_THREADS 0
+
+#else
+
+#define JS_HAVE_THREADS 1
 
 #if defined(_WIN32)
 #define JS_ONCE_INIT INIT_ONCE_STATIC_INIT
 typedef INIT_ONCE js_once_t;
 typedef CRITICAL_SECTION js_mutex_t;
 typedef CONDITION_VARIABLE js_cond_t;
+typedef HANDLE js_thread_t;
 #else
 #define JS_ONCE_INIT PTHREAD_ONCE_INIT
 typedef pthread_once_t js_once_t;
 typedef pthread_mutex_t js_mutex_t;
 typedef pthread_cond_t js_cond_t;
+typedef pthread_t js_thread_t;
 #endif
 
 void js_once(js_once_t *guard, void (*callback)(void));
@@ -591,6 +633,15 @@ void js_cond_broadcast(js_cond_t *cond);
 void js_cond_wait(js_cond_t *cond, js_mutex_t *mutex);
 int js_cond_timedwait(js_cond_t *cond, js_mutex_t *mutex, uint64_t timeout);
 
+enum {
+    JS_THREAD_CREATE_DETACHED = 1,
+};
+
+// creates threads with 2 MB stacks (glibc default)
+int js_thread_create(js_thread_t *thrd, void (*start)(void *), void *arg,
+                     int flags);
+int js_thread_join(js_thread_t thrd);
+
 #endif /* !defined(EMSCRIPTEN) && !defined(__wasi__) */
 
 #ifdef __cplusplus
diff --git a/lib/monoucha0/monoucha/qjs/libbf.c b/lib/monoucha0/monoucha/qjs/libbf.c
deleted file mode 100644
index 26872c6d..00000000
--- a/lib/monoucha0/monoucha/qjs/libbf.c
+++ /dev/null
@@ -1,8422 +0,0 @@
-/*
- * Tiny arbitrary precision floating point library
- *
- * Copyright (c) 2017-2021 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <inttypes.h>
-#include <math.h>
-#include <string.h>
-#include <assert.h>
-
-#ifdef __AVX2__
-#include <immintrin.h>
-#endif
-
-#include "cutils.h"
-#include "libbf.h"
-
-/* enable it to check the multiplication result */
-//#define USE_MUL_CHECK
-/* enable it to use FFT/NTT multiplication */
-#define USE_FFT_MUL
-/* enable decimal floating point support */
-#define USE_BF_DEC
-
-//#define inline __attribute__((always_inline))
-
-#ifdef __AVX2__
-#define FFT_MUL_THRESHOLD 100 /* in limbs of the smallest factor */
-#else
-#define FFT_MUL_THRESHOLD 100 /* in limbs of the smallest factor */
-#endif
-
-/* XXX: adjust */
-#define DIVNORM_LARGE_THRESHOLD 50
-#define UDIV1NORM_THRESHOLD 3
-
-#if LIMB_BITS == 64
-#define FMT_LIMB1 "%" PRIx64
-#define FMT_LIMB "%016" PRIx64
-#define PRId_LIMB PRId64
-#define PRIu_LIMB PRIu64
-
-#else
-
-#define FMT_LIMB1 "%x"
-#define FMT_LIMB "%08x"
-#define PRId_LIMB "d"
-#define PRIu_LIMB "u"
-
-#endif
-
-typedef intptr_t mp_size_t;
-
-typedef int bf_op2_func_t(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-                          bf_flags_t flags);
-
-#ifdef USE_FFT_MUL
-
-#define FFT_MUL_R_OVERLAP_A (1 << 0)
-#define FFT_MUL_R_OVERLAP_B (1 << 1)
-#define FFT_MUL_R_NORESIZE  (1 << 2)
-
-static no_inline int fft_mul(bf_context_t *s,
-                             bf_t *res, limb_t *a_tab, limb_t a_len,
-                             limb_t *b_tab, limb_t b_len, int mul_flags);
-static void fft_clear_cache(bf_context_t *s);
-#endif
-#ifdef USE_BF_DEC
-static limb_t get_digit(const limb_t *tab, limb_t len, slimb_t pos);
-#endif
-
-
-/* count leading zeros */
-static inline int clz(limb_t a)
-{
-    if (a == 0) {
-        return LIMB_BITS;
-    } else {
-#if LIMB_BITS == 64
-        return clz64(a);
-#else
-        return clz32(a);
-#endif
-    }
-}
-
-static inline int ctz(limb_t a)
-{
-    if (a == 0) {
-        return LIMB_BITS;
-    } else {
-#if LIMB_BITS == 64
-        return ctz64(a);
-#else
-        return ctz32(a);
-#endif
-    }
-}
-
-static inline int ceil_log2(limb_t a)
-{
-    if (a <= 1)
-        return 0;
-    else
-        return LIMB_BITS - clz(a - 1);
-}
-
-/* b must be >= 1 */
-static inline slimb_t ceil_div(slimb_t a, slimb_t b)
-{
-    if (a >= 0)
-        return (a + b - 1) / b;
-    else
-        return a / b;
-}
-
-/* b must be >= 1 */
-static inline slimb_t floor_div(slimb_t a, slimb_t b)
-{
-    if (a >= 0) {
-        return a / b;
-    } else {
-        return (a - b + 1) / b;
-    }
-}
-
-/* return r = a modulo b (0 <= r <= b - 1). b must be >= 1 */
-static inline limb_t smod(slimb_t a, slimb_t b)
-{
-    a = a % (slimb_t)b;
-    if (a < 0)
-        a += b;
-    return a;
-}
-
-/* signed addition with saturation */
-static inline slimb_t sat_add(slimb_t a, slimb_t b)
-{
-    slimb_t r;
-    r = a + b;
-    /* overflow ? */
-    if (((a ^ r) & (b ^ r)) < 0)
-        r = (a >> (LIMB_BITS - 1)) ^ (((limb_t)1 << (LIMB_BITS - 1)) - 1);
-    return r;
-}
-
-static inline __maybe_unused limb_t shrd(limb_t low, limb_t high, long shift)
-{
-    if (shift != 0)
-        low = (low >> shift) | (high << (LIMB_BITS - shift));
-    return low;
-}
-
-static inline __maybe_unused limb_t shld(limb_t a1, limb_t a0, long shift)
-{
-    if (shift != 0)
-        return (a1 << shift) | (a0 >> (LIMB_BITS - shift));
-    else
-        return a1;
-}
-
-#define malloc(s) malloc_is_forbidden(s)
-#define free(p) free_is_forbidden(p)
-#define realloc(p, s) realloc_is_forbidden(p, s)
-
-void bf_context_init(bf_context_t *s, bf_realloc_func_t *realloc_func,
-                     void *realloc_opaque)
-{
-    memset(s, 0, sizeof(*s));
-    s->realloc_func = realloc_func;
-    s->realloc_opaque = realloc_opaque;
-}
-
-void bf_context_end(bf_context_t *s)
-{
-    bf_clear_cache(s);
-}
-
-void bf_init(bf_context_t *s, bf_t *r)
-{
-    r->ctx = s;
-    r->sign = 0;
-    r->expn = BF_EXP_ZERO;
-    r->len = 0;
-    r->tab = NULL;
-}
-
-/* return 0 if OK, -1 if alloc error */
-int bf_resize(bf_t *r, limb_t len)
-{
-    limb_t *tab;
-
-    if (len != r->len) {
-        tab = bf_realloc(r->ctx, r->tab, len * sizeof(limb_t));
-        if (!tab && len != 0)
-            return -1;
-        r->tab = tab;
-        r->len = len;
-    }
-    return 0;
-}
-
-/* return 0 or BF_ST_MEM_ERROR */
-int bf_set_ui(bf_t *r, uint64_t a)
-{
-    r->sign = 0;
-    if (a == 0) {
-        r->expn = BF_EXP_ZERO;
-        bf_resize(r, 0); /* cannot fail */
-    }
-#if LIMB_BITS == 32
-    else if (a <= 0xffffffff)
-#else
-    else
-#endif
-    {
-        int shift;
-        if (bf_resize(r, 1))
-            goto fail;
-        shift = clz(a);
-        r->tab[0] = a << shift;
-        r->expn = LIMB_BITS - shift;
-    }
-#if LIMB_BITS == 32
-    else {
-        uint32_t a1, a0;
-        int shift;
-        if (bf_resize(r, 2))
-            goto fail;
-        a0 = a;
-        a1 = a >> 32;
-        shift = clz(a1);
-        r->tab[0] = a0 << shift;
-        r->tab[1] = shld(a1, a0, shift);
-        r->expn = 2 * LIMB_BITS - shift;
-    }
-#endif
-    return 0;
- fail:
-    bf_set_nan(r);
-    return BF_ST_MEM_ERROR;
-}
-
-/* return 0 or BF_ST_MEM_ERROR */
-int bf_set_si(bf_t *r, int64_t a)
-{
-    int ret;
-
-    if (a < 0) {
-        ret = bf_set_ui(r, -a);
-        r->sign = 1;
-    } else {
-        ret = bf_set_ui(r, a);
-    }
-    return ret;
-}
-
-void bf_set_nan(bf_t *r)
-{
-    bf_resize(r, 0); /* cannot fail */
-    r->expn = BF_EXP_NAN;
-    r->sign = 0;
-}
-
-void bf_set_zero(bf_t *r, int is_neg)
-{
-    bf_resize(r, 0); /* cannot fail */
-    r->expn = BF_EXP_ZERO;
-    r->sign = is_neg;
-}
-
-void bf_set_inf(bf_t *r, int is_neg)
-{
-    bf_resize(r, 0); /* cannot fail */
-    r->expn = BF_EXP_INF;
-    r->sign = is_neg;
-}
-
-/* return 0 or BF_ST_MEM_ERROR */
-int bf_set(bf_t *r, const bf_t *a)
-{
-    if (r == a)
-        return 0;
-    if (bf_resize(r, a->len)) {
-        bf_set_nan(r);
-        return BF_ST_MEM_ERROR;
-    }
-    r->sign = a->sign;
-    r->expn = a->expn;
-    if (a->len > 0)
-        memcpy(r->tab, a->tab, a->len * sizeof(limb_t));
-    return 0;
-}
-
-/* equivalent to bf_set(r, a); bf_delete(a) */
-void bf_move(bf_t *r, bf_t *a)
-{
-    bf_context_t *s = r->ctx;
-    if (r == a)
-        return;
-    bf_free(s, r->tab);
-    *r = *a;
-}
-
-static limb_t get_limbz(const bf_t *a, limb_t idx)
-{
-    if (idx >= a->len)
-        return 0;
-    else
-        return a->tab[idx];
-}
-
-/* get LIMB_BITS at bit position 'pos' in tab */
-static inline limb_t get_bits(const limb_t *tab, limb_t len, slimb_t pos)
-{
-    limb_t i, a0, a1;
-    int p;
-
-    i = pos >> LIMB_LOG2_BITS;
-    p = pos & (LIMB_BITS - 1);
-    if (i < len)
-        a0 = tab[i];
-    else
-        a0 = 0;
-    if (p == 0) {
-        return a0;
-    } else {
-        i++;
-        if (i < len)
-            a1 = tab[i];
-        else
-            a1 = 0;
-        return (a0 >> p) | (a1 << (LIMB_BITS - p));
-    }
-}
-
-static inline limb_t get_bit(const limb_t *tab, limb_t len, slimb_t pos)
-{
-    slimb_t i;
-    i = pos >> LIMB_LOG2_BITS;
-    if (i < 0 || i >= len)
-        return 0;
-    return (tab[i] >> (pos & (LIMB_BITS - 1))) & 1;
-}
-
-static inline limb_t limb_mask(int start, int last)
-{
-    limb_t v;
-    int n;
-    n = last - start + 1;
-    if (n == LIMB_BITS)
-        v = -1;
-    else
-        v = (((limb_t)1 << n) - 1) << start;
-    return v;
-}
-
-static limb_t mp_scan_nz(const limb_t *tab, mp_size_t n)
-{
-    mp_size_t i;
-    for(i = 0; i < n; i++) {
-        if (tab[i] != 0)
-            return 1;
-    }
-    return 0;
-}
-
-/* return != 0 if one bit between 0 and bit_pos inclusive is not zero. */
-static inline limb_t scan_bit_nz(const bf_t *r, slimb_t bit_pos)
-{
-    slimb_t pos;
-    limb_t v;
-
-    pos = bit_pos >> LIMB_LOG2_BITS;
-    if (pos < 0)
-        return 0;
-    v = r->tab[pos] & limb_mask(0, bit_pos & (LIMB_BITS - 1));
-    if (v != 0)
-        return 1;
-    pos--;
-    while (pos >= 0) {
-        if (r->tab[pos] != 0)
-            return 1;
-        pos--;
-    }
-    return 0;
-}
-
-/* return the addend for rounding. Note that prec can be <= 0 (for
-   BF_FLAG_RADPNT_PREC) */
-static int bf_get_rnd_add(int *pret, const bf_t *r, limb_t l,
-                          slimb_t prec, int rnd_mode)
-{
-    int add_one, inexact;
-    limb_t bit1, bit0;
-
-    if (rnd_mode == BF_RNDF) {
-        bit0 = 1; /* faithful rounding does not honor the INEXACT flag */
-    } else {
-        /* starting limb for bit 'prec + 1' */
-        bit0 = scan_bit_nz(r, l * LIMB_BITS - 1 - bf_max(0, prec + 1));
-    }
-
-    /* get the bit at 'prec' */
-    bit1 = get_bit(r->tab, l, l * LIMB_BITS - 1 - prec);
-    inexact = (bit1 | bit0) != 0;
-
-    add_one = 0;
-    switch(rnd_mode) {
-    case BF_RNDZ:
-        break;
-    case BF_RNDN:
-        if (bit1) {
-            if (bit0) {
-                add_one = 1;
-            } else {
-                /* round to even */
-                add_one =
-                    get_bit(r->tab, l, l * LIMB_BITS - 1 - (prec - 1));
-            }
-        }
-        break;
-    case BF_RNDD:
-    case BF_RNDU:
-        if (r->sign == (rnd_mode == BF_RNDD))
-            add_one = inexact;
-        break;
-    case BF_RNDA:
-        add_one = inexact;
-        break;
-    case BF_RNDNA:
-    case BF_RNDF:
-        add_one = bit1;
-        break;
-    default:
-        abort();
-    }
-
-    if (inexact)
-        *pret |= BF_ST_INEXACT;
-    return add_one;
-}
-
-static int bf_set_overflow(bf_t *r, int sign, limb_t prec, bf_flags_t flags)
-{
-    slimb_t i, l, e_max;
-    int rnd_mode;
-
-    rnd_mode = flags & BF_RND_MASK;
-    if (prec == BF_PREC_INF ||
-        rnd_mode == BF_RNDN ||
-        rnd_mode == BF_RNDNA ||
-        rnd_mode == BF_RNDA ||
-        (rnd_mode == BF_RNDD && sign == 1) ||
-        (rnd_mode == BF_RNDU && sign == 0)) {
-        bf_set_inf(r, sign);
-    } else {
-        /* set to maximum finite number */
-        l = (prec + LIMB_BITS - 1) / LIMB_BITS;
-        if (bf_resize(r, l)) {
-            bf_set_nan(r);
-            return BF_ST_MEM_ERROR;
-        }
-        r->tab[0] = limb_mask((-prec) & (LIMB_BITS - 1),
-                              LIMB_BITS - 1);
-        for(i = 1; i < l; i++)
-            r->tab[i] = (limb_t)-1;
-        e_max = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
-        r->expn = e_max;
-        r->sign = sign;
-    }
-    return BF_ST_OVERFLOW | BF_ST_INEXACT;
-}
-
-/* round to prec1 bits assuming 'r' is non zero and finite. 'r' is
-   assumed to have length 'l' (1 <= l <= r->len). Note: 'prec1' can be
-   infinite (BF_PREC_INF). 'ret' is 0 or BF_ST_INEXACT if the result
-   is known to be inexact. Can fail with BF_ST_MEM_ERROR in case of
-   overflow not returning infinity. */
-static int __bf_round(bf_t *r, limb_t prec1, bf_flags_t flags, limb_t l,
-                      int ret)
-{
-    limb_t v, a;
-    int shift, add_one, rnd_mode;
-    slimb_t i, bit_pos, pos, e_min, e_max, e_range, prec;
-
-    /* e_min and e_max are computed to match the IEEE 754 conventions */
-    e_range = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
-    e_min = -e_range + 3;
-    e_max = e_range;
-
-    if (flags & BF_FLAG_RADPNT_PREC) {
-        /* 'prec' is the precision after the radix point */
-        if (prec1 != BF_PREC_INF)
-            prec = r->expn + prec1;
-        else
-            prec = prec1;
-    } else if (unlikely(r->expn < e_min) && (flags & BF_FLAG_SUBNORMAL)) {
-        /* restrict the precision in case of potentially subnormal
-           result */
-        assert(prec1 != BF_PREC_INF);
-        prec = prec1 - (e_min - r->expn);
-    } else {
-        prec = prec1;
-    }
-
-    /* round to prec bits */
-    rnd_mode = flags & BF_RND_MASK;
-    add_one = bf_get_rnd_add(&ret, r, l, prec, rnd_mode);
-
-    if (prec <= 0) {
-        if (add_one) {
-            bf_resize(r, 1); /* cannot fail */
-            r->tab[0] = (limb_t)1 << (LIMB_BITS - 1);
-            r->expn += 1 - prec;
-            ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
-            return ret;
-        } else {
-            goto underflow;
-        }
-    } else if (add_one) {
-        limb_t carry;
-
-        /* add one starting at digit 'prec - 1' */
-        bit_pos = l * LIMB_BITS - 1 - (prec - 1);
-        pos = bit_pos >> LIMB_LOG2_BITS;
-        carry = (limb_t)1 << (bit_pos & (LIMB_BITS - 1));
-
-        for(i = pos; i < l; i++) {
-            v = r->tab[i] + carry;
-            carry = (v < carry);
-            r->tab[i] = v;
-            if (carry == 0)
-                break;
-        }
-        if (carry) {
-            /* shift right by one digit */
-            v = 1;
-            for(i = l - 1; i >= pos; i--) {
-                a = r->tab[i];
-                r->tab[i] = (a >> 1) | (v << (LIMB_BITS - 1));
-                v = a;
-            }
-            r->expn++;
-        }
-    }
-
-    /* check underflow */
-    if (unlikely(r->expn < e_min)) {
-        if (flags & BF_FLAG_SUBNORMAL) {
-            /* if inexact, also set the underflow flag */
-            if (ret & BF_ST_INEXACT)
-                ret |= BF_ST_UNDERFLOW;
-        } else {
-        underflow:
-            ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
-            bf_set_zero(r, r->sign);
-            return ret;
-        }
-    }
-
-    /* check overflow */
-    if (unlikely(r->expn > e_max))
-        return bf_set_overflow(r, r->sign, prec1, flags);
-
-    /* keep the bits starting at 'prec - 1' */
-    bit_pos = l * LIMB_BITS - 1 - (prec - 1);
-    i = bit_pos >> LIMB_LOG2_BITS;
-    if (i >= 0) {
-        shift = bit_pos & (LIMB_BITS - 1);
-        if (shift != 0)
-            r->tab[i] &= limb_mask(shift, LIMB_BITS - 1);
-    } else {
-        i = 0;
-    }
-    /* remove trailing zeros */
-    while (r->tab[i] == 0)
-        i++;
-    if (i > 0) {
-        l -= i;
-        memmove(r->tab, r->tab + i, l * sizeof(limb_t));
-    }
-    bf_resize(r, l); /* cannot fail */
-    return ret;
-}
-
-/* 'r' must be a finite number. */
-int bf_normalize_and_round(bf_t *r, limb_t prec1, bf_flags_t flags)
-{
-    limb_t l, v, a;
-    int shift, ret;
-    slimb_t i;
-
-    //    bf_print_str("bf_renorm", r);
-    l = r->len;
-    while (l > 0 && r->tab[l - 1] == 0)
-        l--;
-    if (l == 0) {
-        /* zero */
-        r->expn = BF_EXP_ZERO;
-        bf_resize(r, 0); /* cannot fail */
-        ret = 0;
-    } else {
-        r->expn -= (r->len - l) * LIMB_BITS;
-        /* shift to have the MSB set to '1' */
-        v = r->tab[l - 1];
-        shift = clz(v);
-        if (shift != 0) {
-            v = 0;
-            for(i = 0; i < l; i++) {
-                a = r->tab[i];
-                r->tab[i] = (a << shift) | (v >> (LIMB_BITS - shift));
-                v = a;
-            }
-            r->expn -= shift;
-        }
-        ret = __bf_round(r, prec1, flags, l, 0);
-    }
-    //    bf_print_str("r_final", r);
-    return ret;
-}
-
-/* return true if rounding can be done at precision 'prec' assuming
-   the exact result r is such that |r-a| <= 2^(EXP(a)-k). */
-/* XXX: check the case where the exponent would be incremented by the
-   rounding */
-int bf_can_round(const bf_t *a, slimb_t prec, bf_rnd_t rnd_mode, slimb_t k)
-{
-    bool is_rndn;
-    slimb_t bit_pos, n;
-    limb_t bit;
-
-    if (a->expn == BF_EXP_INF || a->expn == BF_EXP_NAN)
-        return false;
-    if (rnd_mode == BF_RNDF) {
-        return (k >= (prec + 1));
-    }
-    if (a->expn == BF_EXP_ZERO)
-        return false;
-    is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA);
-    if (k < (prec + 2))
-        return false;
-    bit_pos = a->len * LIMB_BITS - 1 - prec;
-    n = k - prec;
-    /* bit pattern for RNDN or RNDNA: 0111.. or 1000...
-       for other rounding modes: 000... or 111...
-    */
-    bit = get_bit(a->tab, a->len, bit_pos);
-    bit_pos--;
-    n--;
-    bit ^= is_rndn;
-    /* XXX: slow, but a few iterations on average */
-    while (n != 0) {
-        if (get_bit(a->tab, a->len, bit_pos) != bit)
-            return true;
-        bit_pos--;
-        n--;
-    }
-    return false;
-}
-
-/* Cannot fail with BF_ST_MEM_ERROR. */
-int bf_round(bf_t *r, limb_t prec, bf_flags_t flags)
-{
-    if (r->len == 0)
-        return 0;
-    return __bf_round(r, prec, flags, r->len, 0);
-}
-
-/* for debugging */
-static __maybe_unused void dump_limbs(const char *str, const limb_t *tab, limb_t n)
-{
-    limb_t i;
-    printf("%s: len=%" PRId_LIMB "\n", str, n);
-    for(i = 0; i < n; i++) {
-        printf("%" PRId_LIMB ": " FMT_LIMB "\n",
-               i, tab[i]);
-    }
-}
-
-void mp_print_str(const char *str, const limb_t *tab, limb_t n)
-{
-    slimb_t i;
-    printf("%s= 0x", str);
-    for(i = n - 1; i >= 0; i--) {
-        if (i != (n - 1))
-            printf("_");
-        printf(FMT_LIMB, tab[i]);
-    }
-    printf("\n");
-}
-
-static __maybe_unused void mp_print_str_h(const char *str,
-                                          const limb_t *tab, limb_t n,
-                                          limb_t high)
-{
-    slimb_t i;
-    printf("%s= 0x", str);
-    printf(FMT_LIMB, high);
-    for(i = n - 1; i >= 0; i--) {
-        printf("_");
-        printf(FMT_LIMB, tab[i]);
-    }
-    printf("\n");
-}
-
-/* for debugging */
-void bf_print_str(const char *str, const bf_t *a)
-{
-    slimb_t i;
-    printf("%s=", str);
-
-    if (a->expn == BF_EXP_NAN) {
-        printf("NaN");
-    } else {
-        if (a->sign)
-            putchar('-');
-        if (a->expn == BF_EXP_ZERO) {
-            putchar('0');
-        } else if (a->expn == BF_EXP_INF) {
-            printf("Inf");
-        } else {
-            printf("0x0.");
-            for(i = a->len - 1; i >= 0; i--)
-                printf(FMT_LIMB, a->tab[i]);
-            printf("p%" PRId_LIMB, a->expn);
-        }
-    }
-    printf("\n");
-}
-
-/* compare the absolute value of 'a' and 'b'. Return < 0 if a < b, 0
-   if a = b and > 0 otherwise. */
-int bf_cmpu(const bf_t *a, const bf_t *b)
-{
-    slimb_t i;
-    limb_t len, v1, v2;
-
-    if (a->expn != b->expn) {
-        if (a->expn < b->expn)
-            return -1;
-        else
-            return 1;
-    }
-    len = bf_max(a->len, b->len);
-    for(i = len - 1; i >= 0; i--) {
-        v1 = get_limbz(a, a->len - len + i);
-        v2 = get_limbz(b, b->len - len + i);
-        if (v1 != v2) {
-            if (v1 < v2)
-                return -1;
-            else
-                return 1;
-        }
-    }
-    return 0;
-}
-
-/* Full order: -0 < 0, NaN == NaN and NaN is larger than all other numbers */
-int bf_cmp_full(const bf_t *a, const bf_t *b)
-{
-    int res;
-
-    if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
-        if (a->expn == b->expn)
-            res = 0;
-        else if (a->expn == BF_EXP_NAN)
-            res = 1;
-        else
-            res = -1;
-    } else if (a->sign != b->sign) {
-        res = 1 - 2 * a->sign;
-    } else {
-        res = bf_cmpu(a, b);
-        if (a->sign)
-            res = -res;
-    }
-    return res;
-}
-
-/* Standard floating point comparison: return 2 if one of the operands
-   is NaN (unordered) or -1, 0, 1 depending on the ordering assuming
-   -0 == +0 */
-int bf_cmp(const bf_t *a, const bf_t *b)
-{
-    int res;
-
-    if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
-        res = 2;
-    } else if (a->sign != b->sign) {
-        if (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_ZERO)
-            res = 0;
-        else
-            res = 1 - 2 * a->sign;
-    } else {
-        res = bf_cmpu(a, b);
-        if (a->sign)
-            res = -res;
-    }
-    return res;
-}
-
-/* Compute the number of bits 'n' matching the pattern:
-   a= X1000..0
-   b= X0111..1
-
-   When computing a-b, the result will have at least n leading zero
-   bits.
-
-   Precondition: a > b and a.expn - b.expn = 0 or 1
-*/
-static limb_t count_cancelled_bits(const bf_t *a, const bf_t *b)
-{
-    slimb_t bit_offset, b_offset, n;
-    int p, p1;
-    limb_t v1, v2, mask;
-
-    bit_offset = a->len * LIMB_BITS - 1;
-    b_offset = (b->len - a->len) * LIMB_BITS - (LIMB_BITS - 1) +
-        a->expn - b->expn;
-    n = 0;
-
-    /* first search the equals bits */
-    for(;;) {
-        v1 = get_limbz(a, bit_offset >> LIMB_LOG2_BITS);
-        v2 = get_bits(b->tab, b->len, bit_offset + b_offset);
-        //        printf("v1=" FMT_LIMB " v2=" FMT_LIMB "\n", v1, v2);
-        if (v1 != v2)
-            break;
-        n += LIMB_BITS;
-        bit_offset -= LIMB_BITS;
-    }
-    /* find the position of the first different bit */
-    p = clz(v1 ^ v2) + 1;
-    n += p;
-    /* then search for '0' in a and '1' in b */
-    p = LIMB_BITS - p;
-    if (p > 0) {
-        /* search in the trailing p bits of v1 and v2 */
-        mask = limb_mask(0, p - 1);
-        p1 = bf_min(clz(v1 & mask), clz((~v2) & mask)) - (LIMB_BITS - p);
-        n += p1;
-        if (p1 != p)
-            goto done;
-    }
-    bit_offset -= LIMB_BITS;
-    for(;;) {
-        v1 = get_limbz(a, bit_offset >> LIMB_LOG2_BITS);
-        v2 = get_bits(b->tab, b->len, bit_offset + b_offset);
-        //        printf("v1=" FMT_LIMB " v2=" FMT_LIMB "\n", v1, v2);
-        if (v1 != 0 || v2 != -1) {
-            /* different: count the matching bits */
-            p1 = bf_min(clz(v1), clz(~v2));
-            n += p1;
-            break;
-        }
-        n += LIMB_BITS;
-        bit_offset -= LIMB_BITS;
-    }
- done:
-    return n;
-}
-
-static int bf_add_internal(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-                           bf_flags_t flags, int b_neg)
-{
-    const bf_t *tmp;
-    int is_sub, ret, cmp_res, a_sign, b_sign;
-
-    a_sign = a->sign;
-    b_sign = b->sign ^ b_neg;
-    is_sub = a_sign ^ b_sign;
-    cmp_res = bf_cmpu(a, b);
-    if (cmp_res < 0) {
-        tmp = a;
-        a = b;
-        b = tmp;
-        a_sign = b_sign; /* b_sign is never used later */
-    }
-    /* abs(a) >= abs(b) */
-    if (cmp_res == 0 && is_sub && a->expn < BF_EXP_INF) {
-        /* zero result */
-        bf_set_zero(r, (flags & BF_RND_MASK) == BF_RNDD);
-        ret = 0;
-    } else if (a->len == 0 || b->len == 0) {
-        ret = 0;
-        if (a->expn >= BF_EXP_INF) {
-            if (a->expn == BF_EXP_NAN) {
-                /* at least one operand is NaN */
-                bf_set_nan(r);
-            } else if (b->expn == BF_EXP_INF && is_sub) {
-                /* infinities with different signs */
-                bf_set_nan(r);
-                ret = BF_ST_INVALID_OP;
-            } else {
-                bf_set_inf(r, a_sign);
-            }
-        } else {
-            /* at least one zero and not subtract */
-            bf_set(r, a);
-            r->sign = a_sign;
-            goto renorm;
-        }
-    } else {
-        slimb_t d, a_offset, b_bit_offset, i, cancelled_bits;
-        limb_t carry, v1, v2, u, r_len, carry1, precl, tot_len, z, sub_mask;
-
-        r->sign = a_sign;
-        r->expn = a->expn;
-        d = a->expn - b->expn;
-        /* must add more precision for the leading cancelled bits in
-           subtraction */
-        if (is_sub) {
-            if (d <= 1)
-                cancelled_bits = count_cancelled_bits(a, b);
-            else
-                cancelled_bits = 1;
-        } else {
-            cancelled_bits = 0;
-        }
-
-        /* add two extra bits for rounding */
-        precl = (cancelled_bits + prec + 2 + LIMB_BITS - 1) / LIMB_BITS;
-        tot_len = bf_max(a->len, b->len + (d + LIMB_BITS - 1) / LIMB_BITS);
-        r_len = bf_min(precl, tot_len);
-        if (bf_resize(r, r_len))
-            goto fail;
-        a_offset = a->len - r_len;
-        b_bit_offset = (b->len - r_len) * LIMB_BITS + d;
-
-        /* compute the bits before for the rounding */
-        carry = is_sub;
-        z = 0;
-        sub_mask = -is_sub;
-        i = r_len - tot_len;
-        while (i < 0) {
-            slimb_t ap, bp;
-            bool inflag;
-
-            ap = a_offset + i;
-            bp = b_bit_offset + i * LIMB_BITS;
-            inflag = false;
-            if (ap >= 0 && ap < a->len) {
-                v1 = a->tab[ap];
-                inflag = true;
-            } else {
-                v1 = 0;
-            }
-            if (bp + LIMB_BITS > 0 && bp < (slimb_t)(b->len * LIMB_BITS)) {
-                v2 = get_bits(b->tab, b->len, bp);
-                inflag = true;
-            } else {
-                v2 = 0;
-            }
-            if (!inflag) {
-                /* outside 'a' and 'b': go directly to the next value
-                   inside a or b so that the running time does not
-                   depend on the exponent difference */
-                i = 0;
-                if (ap < 0)
-                    i = bf_min(i, -a_offset);
-                /* b_bit_offset + i * LIMB_BITS + LIMB_BITS >= 1
-                   equivalent to
-                   i >= ceil(-b_bit_offset + 1 - LIMB_BITS) / LIMB_BITS)
-                */
-                if (bp + LIMB_BITS <= 0)
-                    i = bf_min(i, (-b_bit_offset) >> LIMB_LOG2_BITS);
-            } else {
-                i++;
-            }
-            v2 ^= sub_mask;
-            u = v1 + v2;
-            carry1 = u < v1;
-            u += carry;
-            carry = (u < carry) | carry1;
-            z |= u;
-        }
-        /* and the result */
-        for(i = 0; i < r_len; i++) {
-            v1 = get_limbz(a, a_offset + i);
-            v2 = get_bits(b->tab, b->len, b_bit_offset + i * LIMB_BITS);
-            v2 ^= sub_mask;
-            u = v1 + v2;
-            carry1 = u < v1;
-            u += carry;
-            carry = (u < carry) | carry1;
-            r->tab[i] = u;
-        }
-        /* set the extra bits for the rounding */
-        r->tab[0] |= (z != 0);
-
-        /* carry is only possible in add case */
-        if (!is_sub && carry) {
-            if (bf_resize(r, r_len + 1))
-                goto fail;
-            r->tab[r_len] = 1;
-            r->expn += LIMB_BITS;
-        }
-    renorm:
-        ret = bf_normalize_and_round(r, prec, flags);
-    }
-    return ret;
- fail:
-    bf_set_nan(r);
-    return BF_ST_MEM_ERROR;
-}
-
-static int __bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-                     bf_flags_t flags)
-{
-    return bf_add_internal(r, a, b, prec, flags, 0);
-}
-
-static int __bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-                     bf_flags_t flags)
-{
-    return bf_add_internal(r, a, b, prec, flags, 1);
-}
-
-limb_t mp_add(limb_t *res, const limb_t *op1, const limb_t *op2,
-              limb_t n, limb_t carry)
-{
-    slimb_t i;
-    limb_t k, a, v, k1;
-
-    k = carry;
-    for(i=0;i<n;i++) {
-        v = op1[i];
-        a = v + op2[i];
-        k1 = a < v;
-        a = a + k;
-        k = (a < k) | k1;
-        res[i] = a;
-    }
-    return k;
-}
-
-limb_t mp_add_ui(limb_t *tab, limb_t b, size_t n)
-{
-    size_t i;
-    limb_t k, a;
-
-    k=b;
-    for(i=0;i<n;i++) {
-        if (k == 0)
-            break;
-        a = tab[i] + k;
-        k = (a < k);
-        tab[i] = a;
-    }
-    return k;
-}
-
-limb_t mp_sub(limb_t *res, const limb_t *op1, const limb_t *op2,
-              mp_size_t n, limb_t carry)
-{
-    int i;
-    limb_t k, a, v, k1;
-
-    k = carry;
-    for(i=0;i<n;i++) {
-        v = op1[i];
-        a = v - op2[i];
-        k1 = a > v;
-        v = a - k;
-        k = (v > a) | k1;
-        res[i] = v;
-    }
-    return k;
-}
-
-/* compute 0 - op2 */
-static limb_t mp_neg(limb_t *res, const limb_t *op2, mp_size_t n, limb_t carry)
-{
-    int i;
-    limb_t k, a, v, k1;
-
-    k = carry;
-    for(i=0;i<n;i++) {
-        v = 0;
-        a = v - op2[i];
-        k1 = a > v;
-        v = a - k;
-        k = (v > a) | k1;
-        res[i] = v;
-    }
-    return k;
-}
-
-limb_t mp_sub_ui(limb_t *tab, limb_t b, mp_size_t n)
-{
-    mp_size_t i;
-    limb_t k, a, v;
-
-    k=b;
-    for(i=0;i<n;i++) {
-        v = tab[i];
-        a = v - k;
-        k = a > v;
-        tab[i] = a;
-        if (k == 0)
-            break;
-    }
-    return k;
-}
-
-/* r = (a + high*B^n) >> shift. Return the remainder r (0 <= r < 2^shift).
-   1 <= shift <= LIMB_BITS - 1 */
-static limb_t mp_shr(limb_t *tab_r, const limb_t *tab, mp_size_t n,
-                     int shift, limb_t high)
-{
-    mp_size_t i;
-    limb_t l, a;
-
-    assert(shift >= 1 && shift < LIMB_BITS);
-    l = high;
-    for(i = n - 1; i >= 0; i--) {
-        a = tab[i];
-        tab_r[i] = (a >> shift) | (l << (LIMB_BITS - shift));
-        l = a;
-    }
-    return l & (((limb_t)1 << shift) - 1);
-}
-
-/* tabr[] = taba[] * b + l. Return the high carry */
-static limb_t mp_mul1(limb_t *tabr, const limb_t *taba, limb_t n,
-                      limb_t b, limb_t l)
-{
-    limb_t i;
-    dlimb_t t;
-
-    for(i = 0; i < n; i++) {
-        t = (dlimb_t)taba[i] * (dlimb_t)b + l;
-        tabr[i] = t;
-        l = t >> LIMB_BITS;
-    }
-    return l;
-}
-
-/* tabr[] += taba[] * b, return the high word. */
-static limb_t mp_add_mul1(limb_t *tabr, const limb_t *taba, limb_t n,
-                          limb_t b)
-{
-    limb_t i, l;
-    dlimb_t t;
-
-    l = 0;
-    for(i = 0; i < n; i++) {
-        t = (dlimb_t)taba[i] * (dlimb_t)b + l + tabr[i];
-        tabr[i] = t;
-        l = t >> LIMB_BITS;
-    }
-    return l;
-}
-
-/* size of the result : op1_size + op2_size. */
-static void mp_mul_basecase(limb_t *result,
-                            const limb_t *op1, limb_t op1_size,
-                            const limb_t *op2, limb_t op2_size)
-{
-    limb_t i, r;
-
-    result[op1_size] = mp_mul1(result, op1, op1_size, op2[0], 0);
-    for(i=1;i<op2_size;i++) {
-        r = mp_add_mul1(result + i, op1, op1_size, op2[i]);
-        result[i + op1_size] = r;
-    }
-}
-
-/* return 0 if OK, -1 if memory error */
-/* XXX: change API so that result can be allocated */
-int mp_mul(bf_context_t *s, limb_t *result,
-           const limb_t *op1, limb_t op1_size,
-           const limb_t *op2, limb_t op2_size)
-{
-#ifdef USE_FFT_MUL
-    if (unlikely(bf_min(op1_size, op2_size) >= FFT_MUL_THRESHOLD)) {
-        bf_t r_s, *r = &r_s;
-        r->tab = result;
-        /* XXX: optimize memory usage in API */
-        if (fft_mul(s, r, (limb_t *)op1, op1_size,
-                    (limb_t *)op2, op2_size, FFT_MUL_R_NORESIZE))
-            return -1;
-    } else
-#endif
-    {
-        mp_mul_basecase(result, op1, op1_size, op2, op2_size);
-    }
-    return 0;
-}
-
-/* tabr[] -= taba[] * b. Return the value to substract to the high
-   word. */
-static limb_t mp_sub_mul1(limb_t *tabr, const limb_t *taba, limb_t n,
-                          limb_t b)
-{
-    limb_t i, l;
-    dlimb_t t;
-
-    l = 0;
-    for(i = 0; i < n; i++) {
-        t = tabr[i] - (dlimb_t)taba[i] * (dlimb_t)b - l;
-        tabr[i] = t;
-        l = -(t >> LIMB_BITS);
-    }
-    return l;
-}
-
-/* WARNING: d must be >= 2^(LIMB_BITS-1) */
-static inline limb_t udiv1norm_init(limb_t d)
-{
-    limb_t a0, a1;
-    a1 = -d - 1;
-    a0 = -1;
-    return (((dlimb_t)a1 << LIMB_BITS) | a0) / d;
-}
-
-/* return the quotient and the remainder in '*pr'of 'a1*2^LIMB_BITS+a0
-   / d' with 0 <= a1 < d. */
-static inline limb_t udiv1norm(limb_t *pr, limb_t a1, limb_t a0,
-                                limb_t d, limb_t d_inv)
-{
-    limb_t n1m, n_adj, q, r, ah;
-    dlimb_t a;
-    n1m = ((slimb_t)a0 >> (LIMB_BITS - 1));
-    n_adj = a0 + (n1m & d);
-    a = (dlimb_t)d_inv * (a1 - n1m) + n_adj;
-    q = (a >> LIMB_BITS) + a1;
-    /* compute a - q * r and update q so that the remainder is\
-       between 0 and d - 1 */
-    a = ((dlimb_t)a1 << LIMB_BITS) | a0;
-    a = a - (dlimb_t)q * d - d;
-    ah = a >> LIMB_BITS;
-    q += 1 + ah;
-    r = (limb_t)a + (ah & d);
-    *pr = r;
-    return q;
-}
-
-/* b must be >= 1 << (LIMB_BITS - 1) */
-static limb_t mp_div1norm(limb_t *tabr, const limb_t *taba, limb_t n,
-                          limb_t b, limb_t r)
-{
-    slimb_t i;
-
-    if (n >= UDIV1NORM_THRESHOLD) {
-        limb_t b_inv;
-        b_inv = udiv1norm_init(b);
-        for(i = n - 1; i >= 0; i--) {
-            tabr[i] = udiv1norm(&r, r, taba[i], b, b_inv);
-        }
-    } else {
-        dlimb_t a1;
-        for(i = n - 1; i >= 0; i--) {
-            a1 = ((dlimb_t)r << LIMB_BITS) | taba[i];
-            tabr[i] = a1 / b;
-            r = a1 % b;
-        }
-    }
-    return r;
-}
-
-static int mp_divnorm_large(bf_context_t *s,
-                            limb_t *tabq, limb_t *taba, limb_t na,
-                            const limb_t *tabb, limb_t nb);
-
-/* base case division: divides taba[0..na-1] by tabb[0..nb-1]. tabb[nb
-   - 1] must be >= 1 << (LIMB_BITS - 1). na - nb must be >= 0. 'taba'
-   is modified and contains the remainder (nb limbs). tabq[0..na-nb]
-   contains the quotient with tabq[na - nb] <= 1. */
-static int mp_divnorm(bf_context_t *s, limb_t *tabq, limb_t *taba, limb_t na,
-                      const limb_t *tabb, limb_t nb)
-{
-    limb_t r, a, c, q, v, b1, b1_inv, n, dummy_r;
-    slimb_t i, j;
-
-    b1 = tabb[nb - 1];
-    if (nb == 1) {
-        taba[0] = mp_div1norm(tabq, taba, na, b1, 0);
-        return 0;
-    }
-    n = na - nb;
-    if (bf_min(n, nb) >= DIVNORM_LARGE_THRESHOLD) {
-        return mp_divnorm_large(s, tabq, taba, na, tabb, nb);
-    }
-
-    if (n >= UDIV1NORM_THRESHOLD)
-        b1_inv = udiv1norm_init(b1);
-    else
-        b1_inv = 0;
-
-    /* first iteration: the quotient is only 0 or 1 */
-    q = 1;
-    for(j = nb - 1; j >= 0; j--) {
-        if (taba[n + j] != tabb[j]) {
-            if (taba[n + j] < tabb[j])
-                q = 0;
-            break;
-        }
-    }
-    tabq[n] = q;
-    if (q) {
-        mp_sub(taba + n, taba + n, tabb, nb, 0);
-    }
-
-    for(i = n - 1; i >= 0; i--) {
-        if (unlikely(taba[i + nb] >= b1)) {
-            q = -1;
-        } else if (b1_inv) {
-            q = udiv1norm(&dummy_r, taba[i + nb], taba[i + nb - 1], b1, b1_inv);
-        } else {
-            dlimb_t al;
-            al = ((dlimb_t)taba[i + nb] << LIMB_BITS) | taba[i + nb - 1];
-            q = al / b1;
-            r = al % b1;
-        }
-        r = mp_sub_mul1(taba + i, tabb, nb, q);
-
-        v = taba[i + nb];
-        a = v - r;
-        c = (a > v);
-        taba[i + nb] = a;
-
-        if (c != 0) {
-            /* negative result */
-            for(;;) {
-                q--;
-                c = mp_add(taba + i, taba + i, tabb, nb, 0);
-                /* propagate carry and test if positive result */
-                if (c != 0) {
-                    if (++taba[i + nb] == 0) {
-                        break;
-                    }
-                }
-            }
-        }
-        tabq[i] = q;
-    }
-    return 0;
-}
-
-/* compute r=B^(2*n)/a such as a*r < B^(2*n) < a*r + 2 with n >= 1. 'a'
-   has n limbs with a[n-1] >= B/2 and 'r' has n+1 limbs with r[n] = 1.
-
-   See Modern Computer Arithmetic by Richard P. Brent and Paul
-   Zimmermann, algorithm 3.5 */
-int mp_recip(bf_context_t *s, limb_t *tabr, const limb_t *taba, limb_t n)
-{
-    mp_size_t l, h, k, i;
-    limb_t *tabxh, *tabt, c, *tabu;
-
-    if (n <= 2) {
-        /* return ceil(B^(2*n)/a) - 1 */
-        /* XXX: could avoid allocation */
-        tabu = bf_malloc(s, sizeof(limb_t) * (2 * n + 1));
-        tabt = bf_malloc(s, sizeof(limb_t) * (n + 2));
-        if (!tabt || !tabu)
-            goto fail;
-        for(i = 0; i < 2 * n; i++)
-            tabu[i] = 0;
-        tabu[2 * n] = 1;
-        if (mp_divnorm(s, tabt, tabu, 2 * n + 1, taba, n))
-            goto fail;
-        for(i = 0; i < n + 1; i++)
-            tabr[i] = tabt[i];
-        if (mp_scan_nz(tabu, n) == 0) {
-            /* only happens for a=B^n/2 */
-            mp_sub_ui(tabr, 1, n + 1);
-        }
-    } else {
-        l = (n - 1) / 2;
-        h = n - l;
-        /* n=2p  -> l=p-1, h = p + 1, k = p + 3
-           n=2p+1-> l=p,  h = p + 1; k = p + 2
-        */
-        tabt = bf_malloc(s, sizeof(limb_t) * (n + h + 1));
-        tabu = bf_malloc(s, sizeof(limb_t) * (n + 2 * h - l + 2));
-        if (!tabt || !tabu)
-            goto fail;
-        tabxh = tabr + l;
-        if (mp_recip(s, tabxh, taba + l, h))
-            goto fail;
-        if (mp_mul(s, tabt, taba, n, tabxh, h + 1)) /* n + h + 1 limbs */
-            goto fail;
-        while (tabt[n + h] != 0) {
-            mp_sub_ui(tabxh, 1, h + 1);
-            c = mp_sub(tabt, tabt, taba, n, 0);
-            mp_sub_ui(tabt + n, c, h + 1);
-        }
-        /* T = B^(n+h) - T */
-        mp_neg(tabt, tabt, n + h + 1, 0);
-        tabt[n + h]++;
-        if (mp_mul(s, tabu, tabt + l, n + h + 1 - l, tabxh, h + 1))
-            goto fail;
-        /* n + 2*h - l + 2 limbs */
-        k = 2 * h - l;
-        for(i = 0; i < l; i++)
-            tabr[i] = tabu[i + k];
-        mp_add(tabr + l, tabr + l, tabu + 2 * h, h, 0);
-    }
-    bf_free(s, tabt);
-    bf_free(s, tabu);
-    return 0;
- fail:
-    bf_free(s, tabt);
-    bf_free(s, tabu);
-    return -1;
-}
-
-/* return -1, 0 or 1 */
-static int mp_cmp(const limb_t *taba, const limb_t *tabb, mp_size_t n)
-{
-    mp_size_t i;
-    for(i = n - 1; i >= 0; i--) {
-        if (taba[i] != tabb[i]) {
-            if (taba[i] < tabb[i])
-                return -1;
-            else
-                return 1;
-        }
-    }
-    return 0;
-}
-
-//#define DEBUG_DIVNORM_LARGE
-//#define DEBUG_DIVNORM_LARGE2
-
-/* subquadratic divnorm */
-static int mp_divnorm_large(bf_context_t *s,
-                            limb_t *tabq, limb_t *taba, limb_t na,
-                            const limb_t *tabb, limb_t nb)
-{
-    limb_t *tabb_inv, nq, *tabt, i, n;
-    nq = na - nb;
-#ifdef DEBUG_DIVNORM_LARGE
-    printf("na=%d nb=%d nq=%d\n", (int)na, (int)nb, (int)nq);
-    mp_print_str("a", taba, na);
-    mp_print_str("b", tabb, nb);
-#endif
-    assert(nq >= 1);
-    n = nq;
-    if (nq < nb)
-        n++;
-    tabb_inv = bf_malloc(s, sizeof(limb_t) * (n + 1));
-    tabt = bf_malloc(s, sizeof(limb_t) * 2 * (n + 1));
-    if (!tabb_inv || !tabt)
-        goto fail;
-
-    if (n >= nb) {
-        for(i = 0; i < n - nb; i++)
-            tabt[i] = 0;
-        for(i = 0; i < nb; i++)
-            tabt[i + n - nb] = tabb[i];
-    } else {
-        /* truncate B: need to increment it so that the approximate
-           inverse is smaller that the exact inverse */
-        for(i = 0; i < n; i++)
-            tabt[i] = tabb[i + nb - n];
-        if (mp_add_ui(tabt, 1, n)) {
-            /* tabt = B^n : tabb_inv = B^n */
-            memset(tabb_inv, 0, n * sizeof(limb_t));
-            tabb_inv[n] = 1;
-            goto recip_done;
-        }
-    }
-    if (mp_recip(s, tabb_inv, tabt, n))
-        goto fail;
- recip_done:
-    /* Q=A*B^-1 */
-    if (mp_mul(s, tabt, tabb_inv, n + 1, taba + na - (n + 1), n + 1))
-        goto fail;
-
-    for(i = 0; i < nq + 1; i++)
-        tabq[i] = tabt[i + 2 * (n + 1) - (nq + 1)];
-#ifdef DEBUG_DIVNORM_LARGE
-    mp_print_str("q", tabq, nq + 1);
-#endif
-
-    bf_free(s, tabt);
-    bf_free(s, tabb_inv);
-    tabb_inv = NULL;
-
-    /* R=A-B*Q */
-    tabt = bf_malloc(s, sizeof(limb_t) * (na + 1));
-    if (!tabt)
-        goto fail;
-    if (mp_mul(s, tabt, tabq, nq + 1, tabb, nb))
-        goto fail;
-    /* we add one more limb for the result */
-    mp_sub(taba, taba, tabt, nb + 1, 0);
-    bf_free(s, tabt);
-    /* the approximated quotient is smaller than than the exact one,
-       hence we may have to increment it */
-#ifdef DEBUG_DIVNORM_LARGE2
-    int cnt = 0;
-    static int cnt_max;
-#endif
-    for(;;) {
-        if (taba[nb] == 0 && mp_cmp(taba, tabb, nb) < 0)
-            break;
-        taba[nb] -= mp_sub(taba, taba, tabb, nb, 0);
-        mp_add_ui(tabq, 1, nq + 1);
-#ifdef DEBUG_DIVNORM_LARGE2
-        cnt++;
-#endif
-    }
-#ifdef DEBUG_DIVNORM_LARGE2
-    if (cnt > cnt_max) {
-        cnt_max = cnt;
-        printf("\ncnt=%d nq=%d nb=%d\n", cnt_max, (int)nq, (int)nb);
-    }
-#endif
-    return 0;
- fail:
-    bf_free(s, tabb_inv);
-    bf_free(s, tabt);
-    return -1;
-}
-
-int bf_mul(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-           bf_flags_t flags)
-{
-    int ret, r_sign;
-
-    if (a->len < b->len) {
-        const bf_t *tmp = a;
-        a = b;
-        b = tmp;
-    }
-    r_sign = a->sign ^ b->sign;
-    /* here b->len <= a->len */
-    if (b->len == 0) {
-        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-            ret = 0;
-        } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_INF) {
-            if ((a->expn == BF_EXP_INF && b->expn == BF_EXP_ZERO) ||
-                (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_INF)) {
-                bf_set_nan(r);
-                ret = BF_ST_INVALID_OP;
-            } else {
-                bf_set_inf(r, r_sign);
-                ret = 0;
-            }
-        } else {
-            bf_set_zero(r, r_sign);
-            ret = 0;
-        }
-    } else {
-        bf_t tmp, *r1 = NULL;
-        limb_t a_len, b_len, precl;
-        limb_t *a_tab, *b_tab;
-
-        a_len = a->len;
-        b_len = b->len;
-
-        if ((flags & BF_RND_MASK) == BF_RNDF) {
-            /* faithful rounding does not require using the full inputs */
-            precl = (prec + 2 + LIMB_BITS - 1) / LIMB_BITS;
-            a_len = bf_min(a_len, precl);
-            b_len = bf_min(b_len, precl);
-        }
-        a_tab = a->tab + a->len - a_len;
-        b_tab = b->tab + b->len - b_len;
-
-#ifdef USE_FFT_MUL
-        if (b_len >= FFT_MUL_THRESHOLD) {
-            int mul_flags = 0;
-            if (r == a)
-                mul_flags |= FFT_MUL_R_OVERLAP_A;
-            if (r == b)
-                mul_flags |= FFT_MUL_R_OVERLAP_B;
-            if (fft_mul(r->ctx, r, a_tab, a_len, b_tab, b_len, mul_flags))
-                goto fail;
-        } else
-#endif
-        {
-            if (r == a || r == b) {
-                bf_init(r->ctx, &tmp);
-                r1 = r;
-                r = &tmp;
-            }
-            if (bf_resize(r, a_len + b_len)) {
-            fail:
-                bf_set_nan(r);
-                ret = BF_ST_MEM_ERROR;
-                goto done;
-            }
-            mp_mul_basecase(r->tab, a_tab, a_len, b_tab, b_len);
-        }
-        r->sign = r_sign;
-        r->expn = a->expn + b->expn;
-        ret = bf_normalize_and_round(r, prec, flags);
-    done:
-        if (r == &tmp)
-            bf_move(r1, &tmp);
-    }
-    return ret;
-}
-
-/* multiply 'r' by 2^e */
-int bf_mul_2exp(bf_t *r, slimb_t e, limb_t prec, bf_flags_t flags)
-{
-    slimb_t e_max;
-    if (r->len == 0)
-        return 0;
-    e_max = ((limb_t)1 << BF_EXT_EXP_BITS_MAX) - 1;
-    e = bf_max(e, -e_max);
-    e = bf_min(e, e_max);
-    r->expn += e;
-    return __bf_round(r, prec, flags, r->len, 0);
-}
-
-/* Return e such as a=m*2^e with m odd integer. return 0 if a is zero,
-   Infinite or Nan. */
-slimb_t bf_get_exp_min(const bf_t *a)
-{
-    slimb_t i;
-    limb_t v;
-    int k;
-
-    for(i = 0; i < a->len; i++) {
-        v = a->tab[i];
-        if (v != 0) {
-            k = ctz(v);
-            return a->expn - (a->len - i) * LIMB_BITS + k;
-        }
-    }
-    return 0;
-}
-
-/* a and b must be finite numbers with a >= 0 and b > 0. 'q' is the
-   integer defined as floor(a/b) and r = a - q * b. */
-static void bf_tdivremu(bf_t *q, bf_t *r,
-                        const bf_t *a, const bf_t *b)
-{
-    if (bf_cmpu(a, b) < 0) {
-        bf_set_ui(q, 0);
-        bf_set(r, a);
-    } else {
-        bf_div(q, a, b, bf_max(a->expn - b->expn + 1, 2), BF_RNDZ);
-        bf_rint(q, BF_RNDZ);
-        bf_mul(r, q, b, BF_PREC_INF, BF_RNDZ);
-        bf_sub(r, a, r, BF_PREC_INF, BF_RNDZ);
-    }
-}
-
-static int __bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-                    bf_flags_t flags)
-{
-    bf_context_t *s = r->ctx;
-    int ret, r_sign;
-    limb_t n, nb, precl;
-
-    r_sign = a->sign ^ b->sign;
-    if (a->expn >= BF_EXP_INF || b->expn >= BF_EXP_INF) {
-        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF && b->expn == BF_EXP_INF) {
-            bf_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else if (a->expn == BF_EXP_INF) {
-            bf_set_inf(r, r_sign);
-            return 0;
-        } else {
-            bf_set_zero(r, r_sign);
-            return 0;
-        }
-    } else if (a->expn == BF_EXP_ZERO) {
-        if (b->expn == BF_EXP_ZERO) {
-            bf_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else {
-            bf_set_zero(r, r_sign);
-            return 0;
-        }
-    } else if (b->expn == BF_EXP_ZERO) {
-        bf_set_inf(r, r_sign);
-        return BF_ST_DIVIDE_ZERO;
-    }
-
-    /* number of limbs of the quotient (2 extra bits for rounding) */
-    precl = (prec + 2 + LIMB_BITS - 1) / LIMB_BITS;
-    nb = b->len;
-    n = bf_max(a->len, precl);
-
-    {
-        limb_t *taba, na;
-        slimb_t d;
-
-        na = n + nb;
-
-#if LIMB_LOG2_BITS == 6
-        if (na >= (SIZE_MAX / sizeof(limb_t)) - 1) {
-            return BF_ST_MEM_ERROR;  /* Return memory error status */
-        }
-#endif
-
-        taba = bf_malloc(s, (na + 1) * sizeof(limb_t));
-        if (!taba)
-            goto fail;
-        d = na - a->len;
-        memset(taba, 0, d * sizeof(limb_t));
-        memcpy(taba + d, a->tab, a->len * sizeof(limb_t));
-        if (bf_resize(r, n + 1))
-            goto fail1;
-        if (mp_divnorm(s, r->tab, taba, na, b->tab, nb)) {
-        fail1:
-            bf_free(s, taba);
-            goto fail;
-        }
-        /* see if non zero remainder */
-        if (mp_scan_nz(taba, nb))
-            r->tab[0] |= 1;
-        bf_free(r->ctx, taba);
-        r->expn = a->expn - b->expn + LIMB_BITS;
-        r->sign = r_sign;
-        ret = bf_normalize_and_round(r, prec, flags);
-    }
-    return ret;
- fail:
-    bf_set_nan(r);
-    return BF_ST_MEM_ERROR;
-}
-
-/* division and remainder.
-
-   rnd_mode is the rounding mode for the quotient. The additional
-   rounding mode BF_RND_EUCLIDIAN is supported.
-
-   'q' is an integer. 'r' is rounded with prec and flags (prec can be
-   BF_PREC_INF).
-*/
-int bf_divrem(bf_t *q, bf_t *r, const bf_t *a, const bf_t *b,
-              limb_t prec, bf_flags_t flags, int rnd_mode)
-{
-    bf_t a1_s, *a1 = &a1_s;
-    bf_t b1_s, *b1 = &b1_s;
-    int q_sign, ret;
-    bool is_ceil, is_rndn;
-
-    assert(q != a && q != b);
-    assert(r != a && r != b);
-    assert(q != r);
-
-    if (a->len == 0 || b->len == 0) {
-        bf_set_zero(q, 0);
-        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_ZERO) {
-            bf_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else {
-            bf_set(r, a);
-            return bf_round(r, prec, flags);
-        }
-    }
-
-    q_sign = a->sign ^ b->sign;
-    is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA);
-    switch(rnd_mode) {
-    default:
-    case BF_RNDZ:
-    case BF_RNDN:
-    case BF_RNDNA:
-        is_ceil = false;
-        break;
-    case BF_RNDD:
-        is_ceil = q_sign;
-        break;
-    case BF_RNDU:
-        is_ceil = q_sign ^ 1;
-        break;
-    case BF_RNDA:
-        is_ceil = true;
-        break;
-    case BF_DIVREM_EUCLIDIAN:
-        is_ceil = a->sign;
-        break;
-    }
-
-    a1->expn = a->expn;
-    a1->tab = a->tab;
-    a1->len = a->len;
-    a1->sign = 0;
-
-    b1->expn = b->expn;
-    b1->tab = b->tab;
-    b1->len = b->len;
-    b1->sign = 0;
-
-    /* XXX: could improve to avoid having a large 'q' */
-    bf_tdivremu(q, r, a1, b1);
-    if (bf_is_nan(q) || bf_is_nan(r))
-        goto fail;
-
-    if (r->len != 0) {
-        if (is_rndn) {
-            int res;
-            b1->expn--;
-            res = bf_cmpu(r, b1);
-            b1->expn++;
-            if (res > 0 ||
-                (res == 0 &&
-                 (rnd_mode == BF_RNDNA ||
-                  get_bit(q->tab, q->len, q->len * LIMB_BITS - q->expn)))) {
-                goto do_sub_r;
-            }
-        } else if (is_ceil) {
-        do_sub_r:
-            ret = bf_add_si(q, q, 1, BF_PREC_INF, BF_RNDZ);
-            ret |= bf_sub(r, r, b1, BF_PREC_INF, BF_RNDZ);
-            if (ret & BF_ST_MEM_ERROR)
-                goto fail;
-        }
-    }
-
-    r->sign ^= a->sign;
-    q->sign = q_sign;
-    return bf_round(r, prec, flags);
- fail:
-    bf_set_nan(q);
-    bf_set_nan(r);
-    return BF_ST_MEM_ERROR;
-}
-
-int bf_rem(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-           bf_flags_t flags, int rnd_mode)
-{
-    bf_t q_s, *q = &q_s;
-    int ret;
-
-    bf_init(r->ctx, q);
-    ret = bf_divrem(q, r, a, b, prec, flags, rnd_mode);
-    bf_delete(q);
-    return ret;
-}
-
-static inline int bf_get_limb(slimb_t *pres, const bf_t *a, int flags)
-{
-#if LIMB_BITS == 32
-    return bf_get_int32(pres, a, flags);
-#else
-    return bf_get_int64(pres, a, flags);
-#endif
-}
-
-int bf_remquo(slimb_t *pq, bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-              bf_flags_t flags, int rnd_mode)
-{
-    bf_t q_s, *q = &q_s;
-    int ret;
-
-    bf_init(r->ctx, q);
-    ret = bf_divrem(q, r, a, b, prec, flags, rnd_mode);
-    bf_get_limb(pq, q, BF_GET_INT_MOD);
-    bf_delete(q);
-    return ret;
-}
-
-static __maybe_unused inline limb_t mul_mod(limb_t a, limb_t b, limb_t m)
-{
-    dlimb_t t;
-    t = (dlimb_t)a * (dlimb_t)b;
-    return t % m;
-}
-
-#if defined(USE_MUL_CHECK)
-static limb_t mp_mod1(const limb_t *tab, limb_t n, limb_t m, limb_t r)
-{
-    slimb_t i;
-    dlimb_t t;
-
-    for(i = n - 1; i >= 0; i--) {
-        t = ((dlimb_t)r << LIMB_BITS) | tab[i];
-        r = t % m;
-    }
-    return r;
-}
-#endif
-
-static const uint16_t sqrt_table[192] = {
-128,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,144,145,146,147,148,149,150,150,151,152,153,154,155,155,156,157,158,159,160,160,161,162,163,163,164,165,166,167,167,168,169,170,170,171,172,173,173,174,175,176,176,177,178,178,179,180,181,181,182,183,183,184,185,185,186,187,187,188,189,189,190,191,192,192,193,193,194,195,195,196,197,197,198,199,199,200,201,201,202,203,203,204,204,205,206,206,207,208,208,209,209,210,211,211,212,212,213,214,214,215,215,216,217,217,218,218,219,219,220,221,221,222,222,223,224,224,225,225,226,226,227,227,228,229,229,230,230,231,231,232,232,233,234,234,235,235,236,236,237,237,238,238,239,240,240,241,241,242,242,243,243,244,244,245,245,246,246,247,247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255,
-};
-
-/* a >= 2^(LIMB_BITS - 2).  Return (s, r) with s=floor(sqrt(a)) and
-   r=a-s^2. 0 <= r <= 2 * s */
-static limb_t mp_sqrtrem1(limb_t *pr, limb_t a)
-{
-    limb_t s1, r1, s, r, q, u, num;
-
-    /* use a table for the 16 -> 8 bit sqrt */
-    s1 = sqrt_table[(a >> (LIMB_BITS - 8)) - 64];
-    r1 = (a >> (LIMB_BITS - 16)) - s1 * s1;
-    if (r1 > 2 * s1) {
-        r1 -= 2 * s1 + 1;
-        s1++;
-    }
-
-    /* one iteration to get a 32 -> 16 bit sqrt */
-    num = (r1 << 8) | ((a >> (LIMB_BITS - 32 + 8)) & 0xff);
-    q = num / (2 * s1); /* q <= 2^8 */
-    u = num % (2 * s1);
-    s = (s1 << 8) + q;
-    r = (u << 8) | ((a >> (LIMB_BITS - 32)) & 0xff);
-    r -= q * q;
-    if ((slimb_t)r < 0) {
-        s--;
-        r += 2 * s + 1;
-    }
-
-#if LIMB_BITS == 64
-    s1 = s;
-    r1 = r;
-    /* one more iteration for 64 -> 32 bit sqrt */
-    num = (r1 << 16) | ((a >> (LIMB_BITS - 64 + 16)) & 0xffff);
-    q = num / (2 * s1); /* q <= 2^16 */
-    u = num % (2 * s1);
-    s = (s1 << 16) + q;
-    r = (u << 16) | ((a >> (LIMB_BITS - 64)) & 0xffff);
-    r -= q * q;
-    if ((slimb_t)r < 0) {
-        s--;
-        r += 2 * s + 1;
-    }
-#endif
-    *pr = r;
-    return s;
-}
-
-/* return floor(sqrt(a)) */
-limb_t bf_isqrt(limb_t a)
-{
-    limb_t s, r;
-    int k;
-
-    if (a == 0)
-        return 0;
-    k = clz(a) & ~1;
-    s = mp_sqrtrem1(&r, a << k);
-    s >>= (k >> 1);
-    return s;
-}
-
-static limb_t mp_sqrtrem2(limb_t *tabs, limb_t *taba)
-{
-    limb_t s1, r1, s, q, u, a0, a1;
-    dlimb_t r, num;
-    int l;
-
-    a0 = taba[0];
-    a1 = taba[1];
-    s1 = mp_sqrtrem1(&r1, a1);
-    l = LIMB_BITS / 2;
-    num = ((dlimb_t)r1 << l) | (a0 >> l);
-    q = num / (2 * s1);
-    u = num % (2 * s1);
-    s = (s1 << l) + q;
-    r = ((dlimb_t)u << l) | (a0 & (((limb_t)1 << l) - 1));
-    if (unlikely((q >> l) != 0))
-        r -= (dlimb_t)1 << LIMB_BITS; /* special case when q=2^l */
-    else
-        r -= q * q;
-    if ((slimb_t)(r >> LIMB_BITS) < 0) {
-        s--;
-        r += 2 * (dlimb_t)s + 1;
-    }
-    tabs[0] = s;
-    taba[0] = r;
-    return r >> LIMB_BITS;
-}
-
-//#define DEBUG_SQRTREM
-
-/* tmp_buf must contain (n / 2 + 1 limbs). *prh contains the highest
-   limb of the remainder. */
-static int mp_sqrtrem_rec(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n,
-                          limb_t *tmp_buf, limb_t *prh)
-{
-    limb_t l, h, rh, ql, qh, c, i;
-
-    if (n == 1) {
-        *prh = mp_sqrtrem2(tabs, taba);
-        return 0;
-    }
-#ifdef DEBUG_SQRTREM
-    mp_print_str("a", taba, 2 * n);
-#endif
-    l = n / 2;
-    h = n - l;
-    if (mp_sqrtrem_rec(s, tabs + l, taba + 2 * l, h, tmp_buf, &qh))
-        return -1;
-#ifdef DEBUG_SQRTREM
-    mp_print_str("s1", tabs + l, h);
-    mp_print_str_h("r1", taba + 2 * l, h, qh);
-    mp_print_str_h("r2", taba + l, n, qh);
-#endif
-
-    /* the remainder is in taba + 2 * l. Its high bit is in qh */
-    if (qh) {
-        mp_sub(taba + 2 * l, taba + 2 * l, tabs + l, h, 0);
-    }
-    /* instead of dividing by 2*s, divide by s (which is normalized)
-       and update q and r */
-    if (mp_divnorm(s, tmp_buf, taba + l, n, tabs + l, h))
-        return -1;
-    qh += tmp_buf[l];
-    for(i = 0; i < l; i++)
-        tabs[i] = tmp_buf[i];
-    ql = mp_shr(tabs, tabs, l, 1, qh & 1);
-    qh = qh >> 1; /* 0 or 1 */
-    if (ql)
-        rh = mp_add(taba + l, taba + l, tabs + l, h, 0);
-    else
-        rh = 0;
-#ifdef DEBUG_SQRTREM
-    mp_print_str_h("q", tabs, l, qh);
-    mp_print_str_h("u", taba + l, h, rh);
-#endif
-
-    mp_add_ui(tabs + l, qh, h);
-#ifdef DEBUG_SQRTREM
-    mp_print_str_h("s2", tabs, n, sh);
-#endif
-
-    /* q = qh, tabs[l - 1 ... 0], r = taba[n - 1 ... l] */
-    /* subtract q^2. if qh = 1 then q = B^l, so we can take shortcuts */
-    if (qh) {
-        c = qh;
-    } else {
-        if (mp_mul(s, taba + n, tabs, l, tabs, l))
-            return -1;
-        c = mp_sub(taba, taba, taba + n, 2 * l, 0);
-    }
-    rh -= mp_sub_ui(taba + 2 * l, c, n - 2 * l);
-    if ((slimb_t)rh < 0) {
-        mp_sub_ui(tabs, 1, n);
-        rh += mp_add_mul1(taba, tabs, n, 2);
-        rh += mp_add_ui(taba, 1, n);
-    }
-    *prh = rh;
-    return 0;
-}
-
-/* 'taba' has 2*n limbs with n >= 1 and taba[2*n-1] >= 2 ^ (LIMB_BITS
-   - 2). Return (s, r) with s=floor(sqrt(a)) and r=a-s^2. 0 <= r <= 2
-   * s. tabs has n limbs. r is returned in the lower n limbs of
-   taba. Its r[n] is the returned value of the function. */
-/* Algorithm from the article "Karatsuba Square Root" by Paul Zimmermann and
-   inspirated from its GMP implementation */
-int mp_sqrtrem(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n)
-{
-    limb_t tmp_buf1[8];
-    limb_t *tmp_buf;
-    mp_size_t n2;
-    int ret;
-    n2 = n / 2 + 1;
-    if (n2 <= countof(tmp_buf1)) {
-        tmp_buf = tmp_buf1;
-    } else {
-        tmp_buf = bf_malloc(s, sizeof(limb_t) * n2);
-        if (!tmp_buf)
-            return -1;
-    }
-    ret = mp_sqrtrem_rec(s, tabs, taba, n, tmp_buf, taba + n);
-    if (tmp_buf != tmp_buf1)
-        bf_free(s, tmp_buf);
-    return ret;
-}
-
-/* Integer square root with remainder. 'a' must be an integer. r =
-   floor(sqrt(a)) and rem = a - r^2.  BF_ST_INEXACT is set if the result
-   is inexact. 'rem' can be NULL if the remainder is not needed. */
-int bf_sqrtrem(bf_t *r, bf_t *rem1, const bf_t *a)
-{
-    int ret;
-
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-        } else if (a->expn == BF_EXP_INF && a->sign) {
-            goto invalid_op;
-        } else {
-            bf_set(r, a);
-        }
-        if (rem1)
-            bf_set_ui(rem1, 0);
-        ret = 0;
-    } else if (a->sign) {
- invalid_op:
-        bf_set_nan(r);
-        if (rem1)
-            bf_set_ui(rem1, 0);
-        ret = BF_ST_INVALID_OP;
-    } else {
-        bf_t rem_s, *rem;
-
-        bf_sqrt(r, a, (a->expn + 1) / 2, BF_RNDZ);
-        bf_rint(r, BF_RNDZ);
-        /* see if the result is exact by computing the remainder */
-        if (rem1) {
-            rem = rem1;
-        } else {
-            rem = &rem_s;
-            bf_init(r->ctx, rem);
-        }
-        /* XXX: could avoid recomputing the remainder */
-        bf_mul(rem, r, r, BF_PREC_INF, BF_RNDZ);
-        bf_neg(rem);
-        bf_add(rem, rem, a, BF_PREC_INF, BF_RNDZ);
-        if (bf_is_nan(rem)) {
-            ret = BF_ST_MEM_ERROR;
-            goto done;
-        }
-        if (rem->len != 0) {
-            ret = BF_ST_INEXACT;
-        } else {
-            ret = 0;
-        }
-    done:
-        if (!rem1)
-            bf_delete(rem);
-    }
-    return ret;
-}
-
-int bf_sqrt(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
-{
-    bf_context_t *s = a->ctx;
-    int ret;
-
-    assert(r != a);
-
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-        } else if (a->expn == BF_EXP_INF && a->sign) {
-            goto invalid_op;
-        } else {
-            bf_set(r, a);
-        }
-        ret = 0;
-    } else if (a->sign) {
- invalid_op:
-        bf_set_nan(r);
-        ret = BF_ST_INVALID_OP;
-    } else {
-        limb_t *a1;
-        slimb_t n, n1;
-        limb_t res;
-
-        /* convert the mantissa to an integer with at least 2 *
-           prec + 4 bits */
-        n = (2 * (prec + 2) + 2 * LIMB_BITS - 1) / (2 * LIMB_BITS);
-        if (bf_resize(r, n))
-            goto fail;
-        a1 = bf_malloc(s, sizeof(limb_t) * 2 * n);
-        if (!a1)
-            goto fail;
-        n1 = bf_min(2 * n, a->len);
-        memset(a1, 0, (2 * n - n1) * sizeof(limb_t));
-        memcpy(a1 + 2 * n - n1, a->tab + a->len - n1, n1 * sizeof(limb_t));
-        if (a->expn & 1) {
-            res = mp_shr(a1, a1, 2 * n, 1, 0);
-        } else {
-            res = 0;
-        }
-        if (mp_sqrtrem(s, r->tab, a1, n)) {
-            bf_free(s, a1);
-            goto fail;
-        }
-        if (!res) {
-            res = mp_scan_nz(a1, n + 1);
-        }
-        bf_free(s, a1);
-        if (!res) {
-            res = mp_scan_nz(a->tab, a->len - n1);
-        }
-        if (res != 0)
-            r->tab[0] |= 1;
-        r->sign = 0;
-        r->expn = (a->expn + 1) >> 1;
-        ret = bf_round(r, prec, flags);
-    }
-    return ret;
- fail:
-    bf_set_nan(r);
-    return BF_ST_MEM_ERROR;
-}
-
-static no_inline int bf_op2(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-                            bf_flags_t flags, bf_op2_func_t *func)
-{
-    bf_t tmp;
-    int ret;
-
-    if (r == a || r == b) {
-        bf_init(r->ctx, &tmp);
-        ret = func(&tmp, a, b, prec, flags);
-        bf_move(r, &tmp);
-    } else {
-        ret = func(r, a, b, prec, flags);
-    }
-    return ret;
-}
-
-int bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-            bf_flags_t flags)
-{
-    return bf_op2(r, a, b, prec, flags, __bf_add);
-}
-
-int bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-            bf_flags_t flags)
-{
-    return bf_op2(r, a, b, prec, flags, __bf_sub);
-}
-
-int bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-           bf_flags_t flags)
-{
-    return bf_op2(r, a, b, prec, flags, __bf_div);
-}
-
-int bf_mul_ui(bf_t *r, const bf_t *a, uint64_t b1, limb_t prec,
-               bf_flags_t flags)
-{
-    bf_t b;
-    int ret;
-    bf_init(r->ctx, &b);
-    ret = bf_set_ui(&b, b1);
-    ret |= bf_mul(r, a, &b, prec, flags);
-    bf_delete(&b);
-    return ret;
-}
-
-int bf_mul_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec,
-               bf_flags_t flags)
-{
-    bf_t b;
-    int ret;
-    bf_init(r->ctx, &b);
-    ret = bf_set_si(&b, b1);
-    ret |= bf_mul(r, a, &b, prec, flags);
-    bf_delete(&b);
-    return ret;
-}
-
-int bf_add_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec,
-              bf_flags_t flags)
-{
-    bf_t b;
-    int ret;
-
-    bf_init(r->ctx, &b);
-    ret = bf_set_si(&b, b1);
-    ret |= bf_add(r, a, &b, prec, flags);
-    bf_delete(&b);
-    return ret;
-}
-
-static int bf_pow_ui(bf_t *r, const bf_t *a, limb_t b, limb_t prec,
-                     bf_flags_t flags)
-{
-    int ret, n_bits, i;
-
-    assert(r != a);
-    if (b == 0)
-        return bf_set_ui(r, 1);
-    ret = bf_set(r, a);
-    n_bits = LIMB_BITS - clz(b);
-    for(i = n_bits - 2; i >= 0; i--) {
-        ret |= bf_mul(r, r, r, prec, flags);
-        if ((b >> i) & 1)
-            ret |= bf_mul(r, r, a, prec, flags);
-    }
-    return ret;
-}
-
-static int bf_pow_ui_ui(bf_t *r, limb_t a1, limb_t b,
-                        limb_t prec, bf_flags_t flags)
-{
-    bf_t a;
-    int ret;
-
-    if (a1 == 10 && b <= LIMB_DIGITS) {
-        /* use precomputed powers. We do not round at this point
-           because we expect the caller to do it */
-        ret = bf_set_ui(r, mp_pow_dec[b]);
-    } else {
-        bf_init(r->ctx, &a);
-        ret = bf_set_ui(&a, a1);
-        ret |= bf_pow_ui(r, &a, b, prec, flags);
-        bf_delete(&a);
-    }
-    return ret;
-}
-
-/* convert to integer (infinite precision) */
-int bf_rint(bf_t *r, int rnd_mode)
-{
-    return bf_round(r, 0, rnd_mode | BF_FLAG_RADPNT_PREC);
-}
-
-/* logical operations */
-#define BF_LOGIC_OR  0
-#define BF_LOGIC_XOR 1
-#define BF_LOGIC_AND 2
-
-static inline limb_t bf_logic_op1(limb_t a, limb_t b, int op)
-{
-    switch(op) {
-    case BF_LOGIC_OR:
-        return a | b;
-    case BF_LOGIC_XOR:
-        return a ^ b;
-    default:
-    case BF_LOGIC_AND:
-        return a & b;
-    }
-}
-
-static int bf_logic_op(bf_t *r, const bf_t *a1, const bf_t *b1, int op)
-{
-    bf_t b1_s, a1_s, *a, *b;
-    limb_t a_sign, b_sign, r_sign;
-    slimb_t l, i, a_bit_offset, b_bit_offset;
-    limb_t v1, v2, v1_mask, v2_mask, r_mask;
-    int ret;
-
-    assert(r != a1 && r != b1);
-
-    if (a1->expn <= 0)
-        a_sign = 0; /* minus zero is considered as positive */
-    else
-        a_sign = a1->sign;
-
-    if (b1->expn <= 0)
-        b_sign = 0; /* minus zero is considered as positive */
-    else
-        b_sign = b1->sign;
-
-    if (a_sign) {
-        a = &a1_s;
-        bf_init(r->ctx, a);
-        if (bf_add_si(a, a1, 1, BF_PREC_INF, BF_RNDZ)) {
-            b = NULL;
-            goto fail;
-        }
-    } else {
-        a = (bf_t *)a1;
-    }
-
-    if (b_sign) {
-        b = &b1_s;
-        bf_init(r->ctx, b);
-        if (bf_add_si(b, b1, 1, BF_PREC_INF, BF_RNDZ))
-            goto fail;
-    } else {
-        b = (bf_t *)b1;
-    }
-
-    r_sign = bf_logic_op1(a_sign, b_sign, op);
-    if (op == BF_LOGIC_AND && r_sign == 0) {
-        /* no need to compute extra zeros for and */
-        if (a_sign == 0 && b_sign == 0)
-            l = bf_min(a->expn, b->expn);
-        else if (a_sign == 0)
-            l = a->expn;
-        else
-            l = b->expn;
-    } else {
-        l = bf_max(a->expn, b->expn);
-    }
-    /* Note: a or b can be zero */
-    l = (bf_max(l, 1) + LIMB_BITS - 1) / LIMB_BITS;
-    if (bf_resize(r, l))
-        goto fail;
-    a_bit_offset = a->len * LIMB_BITS - a->expn;
-    b_bit_offset = b->len * LIMB_BITS - b->expn;
-    v1_mask = -a_sign;
-    v2_mask = -b_sign;
-    r_mask = -r_sign;
-    for(i = 0; i < l; i++) {
-        v1 = get_bits(a->tab, a->len, a_bit_offset + i * LIMB_BITS) ^ v1_mask;
-        v2 = get_bits(b->tab, b->len, b_bit_offset + i * LIMB_BITS) ^ v2_mask;
-        r->tab[i] = bf_logic_op1(v1, v2, op) ^ r_mask;
-    }
-    r->expn = l * LIMB_BITS;
-    r->sign = r_sign;
-    bf_normalize_and_round(r, BF_PREC_INF, BF_RNDZ); /* cannot fail */
-    if (r_sign) {
-        if (bf_add_si(r, r, -1, BF_PREC_INF, BF_RNDZ))
-            goto fail;
-    }
-    ret = 0;
- done:
-    if (a == &a1_s)
-        bf_delete(a);
-    if (b == &b1_s)
-        bf_delete(b);
-    return ret;
- fail:
-    bf_set_nan(r);
-    ret = BF_ST_MEM_ERROR;
-    goto done;
-}
-
-/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */
-int bf_logic_or(bf_t *r, const bf_t *a, const bf_t *b)
-{
-    return bf_logic_op(r, a, b, BF_LOGIC_OR);
-}
-
-/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */
-int bf_logic_xor(bf_t *r, const bf_t *a, const bf_t *b)
-{
-    return bf_logic_op(r, a, b, BF_LOGIC_XOR);
-}
-
-/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */
-int bf_logic_and(bf_t *r, const bf_t *a, const bf_t *b)
-{
-    return bf_logic_op(r, a, b, BF_LOGIC_AND);
-}
-
-/* conversion between fixed size types */
-
-typedef union {
-    double d;
-    uint64_t u;
-} Float64Union;
-
-int bf_get_float64(const bf_t *a, double *pres, bf_rnd_t rnd_mode)
-{
-    Float64Union u;
-    int e, ret;
-    uint64_t m;
-
-    ret = 0;
-    if (a->expn == BF_EXP_NAN) {
-        u.u = 0x7ff8000000000000; /* quiet nan */
-    } else {
-        bf_t b_s, *b = &b_s;
-
-        bf_init(a->ctx, b);
-        bf_set(b, a);
-        if (bf_is_finite(b)) {
-            ret = bf_round(b, 53, rnd_mode | BF_FLAG_SUBNORMAL | bf_set_exp_bits(11));
-        }
-        if (b->expn == BF_EXP_INF) {
-            e = (1 << 11) - 1;
-            m = 0;
-        } else if (b->expn == BF_EXP_ZERO) {
-            e = 0;
-            m = 0;
-        } else {
-            e = b->expn + 1023 - 1;
-#if LIMB_BITS == 32
-            if (b->len == 2) {
-                m = ((uint64_t)b->tab[1] << 32) | b->tab[0];
-            } else {
-                m = ((uint64_t)b->tab[0] << 32);
-            }
-#else
-            m = b->tab[0];
-#endif
-            if (e <= 0) {
-                /* subnormal */
-                m = m >> (12 - e);
-                e = 0;
-            } else {
-                m = (m << 1) >> 12;
-            }
-        }
-        u.u = m | ((uint64_t)e << 52) | ((uint64_t)b->sign << 63);
-        bf_delete(b);
-    }
-    *pres = u.d;
-    return ret;
-}
-
-int bf_set_float64(bf_t *a, double d)
-{
-    Float64Union u;
-    uint64_t m;
-    int shift, e, sgn;
-
-    u.d = d;
-    sgn = u.u >> 63;
-    e = (u.u >> 52) & ((1 << 11) - 1);
-    m = u.u & (((uint64_t)1 << 52) - 1);
-    if (e == ((1 << 11) - 1)) {
-        if (m != 0) {
-            bf_set_nan(a);
-        } else {
-            bf_set_inf(a, sgn);
-        }
-    } else if (e == 0) {
-        if (m == 0) {
-            bf_set_zero(a, sgn);
-        } else {
-            /* subnormal number */
-            m <<= 12;
-            shift = clz64(m);
-            m <<= shift;
-            e = -shift;
-            goto norm;
-        }
-    } else {
-        m = (m << 11) | ((uint64_t)1 << 63);
-    norm:
-        a->expn = e - 1023 + 1;
-#if LIMB_BITS == 32
-        if (bf_resize(a, 2))
-            goto fail;
-        a->tab[0] = m;
-        a->tab[1] = m >> 32;
-#else
-        if (bf_resize(a, 1))
-            goto fail;
-        a->tab[0] = m;
-#endif
-        a->sign = sgn;
-    }
-    return 0;
-fail:
-    bf_set_nan(a);
-    return BF_ST_MEM_ERROR;
-}
-
-/* The rounding mode is always BF_RNDZ. Return BF_ST_INVALID_OP if there
-   is an overflow and 0 otherwise. */
-int bf_get_int32(int *pres, const bf_t *a, int flags)
-{
-    uint32_t v;
-    int ret;
-    if (a->expn >= BF_EXP_INF) {
-        ret = BF_ST_INVALID_OP;
-        if (flags & BF_GET_INT_MOD) {
-            v = 0;
-        } else if (a->expn == BF_EXP_INF) {
-            v = (uint32_t)INT32_MAX + a->sign;
-        } else {
-            v = INT32_MAX;
-        }
-    } else if (a->expn <= 0) {
-        v = 0;
-        ret = 0;
-    } else if (a->expn <= 31) {
-        v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
-        if (a->sign)
-            v = -v;
-        ret = 0;
-    } else if (!(flags & BF_GET_INT_MOD)) {
-        ret = BF_ST_INVALID_OP;
-        if (a->sign) {
-            v = (uint32_t)INT32_MAX + 1;
-            if (a->expn == 32 &&
-                (a->tab[a->len - 1] >> (LIMB_BITS - 32)) == v) {
-                ret = 0;
-            }
-        } else {
-            v = INT32_MAX;
-        }
-    } else {
-        v = get_bits(a->tab, a->len, a->len * LIMB_BITS - a->expn);
-        if (a->sign)
-            v = -v;
-        ret = 0;
-    }
-    *pres = v;
-    return ret;
-}
-
-/* The rounding mode is always BF_RNDZ. Return BF_ST_INVALID_OP if there
-   is an overflow and 0 otherwise. */
-int bf_get_int64(int64_t *pres, const bf_t *a, int flags)
-{
-    uint64_t v;
-    int ret;
-    if (a->expn >= BF_EXP_INF) {
-        ret = BF_ST_INVALID_OP;
-        if (flags & BF_GET_INT_MOD) {
-            v = 0;
-        } else if (a->expn == BF_EXP_INF) {
-            v = (uint64_t)INT64_MAX + a->sign;
-        } else {
-            v = INT64_MAX;
-        }
-    } else if (a->expn <= 0) {
-        v = 0;
-        ret = 0;
-    } else if (a->expn <= 63) {
-#if LIMB_BITS == 32
-        if (a->expn <= 32)
-            v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
-        else
-            v = (((uint64_t)a->tab[a->len - 1] << 32) |
-                 get_limbz(a, a->len - 2)) >> (64 - a->expn);
-#else
-        v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
-#endif
-        if (a->sign)
-            v = -v;
-        ret = 0;
-    } else if (!(flags & BF_GET_INT_MOD)) {
-        ret = BF_ST_INVALID_OP;
-        if (a->sign) {
-            uint64_t v1;
-            v = (uint64_t)INT64_MAX + 1;
-            if (a->expn == 64) {
-                v1 = a->tab[a->len - 1];
-#if LIMB_BITS == 32
-                v1 = (v1 << 32) | get_limbz(a, a->len - 2);
-#endif
-                if (v1 == v)
-                    ret = 0;
-            }
-        } else {
-            v = INT64_MAX;
-        }
-    } else {
-        slimb_t bit_pos = a->len * LIMB_BITS - a->expn;
-        v = get_bits(a->tab, a->len, bit_pos);
-#if LIMB_BITS == 32
-        v |= (uint64_t)get_bits(a->tab, a->len, bit_pos + 32) << 32;
-#endif
-        if (a->sign)
-            v = -v;
-        ret = 0;
-    }
-    *pres = v;
-    return ret;
-}
-
-/* The rounding mode is always BF_RNDZ. Return BF_ST_INVALID_OP if there
-   is an overflow and 0 otherwise. */
-int bf_get_uint64(uint64_t *pres, const bf_t *a)
-{
-    uint64_t v;
-    int ret;
-    if (a->expn == BF_EXP_NAN) {
-        goto overflow;
-    } else if (a->expn <= 0) {
-        v = 0;
-        ret = 0;
-    } else if (a->sign) {
-        v = 0;
-        ret = BF_ST_INVALID_OP;
-    } else if (a->expn <= 64) {
-#if LIMB_BITS == 32
-        if (a->expn <= 32)
-            v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
-        else
-            v = (((uint64_t)a->tab[a->len - 1] << 32) |
-                 get_limbz(a, a->len - 2)) >> (64 - a->expn);
-#else
-        v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
-#endif
-        ret = 0;
-    } else {
-    overflow:
-        v = UINT64_MAX;
-        ret = BF_ST_INVALID_OP;
-    }
-    *pres = v;
-    return ret;
-}
-
-/* base conversion from radix */
-
-static const uint8_t digits_per_limb_table[BF_RADIX_MAX - 1] = {
-#if LIMB_BITS == 32
-32,20,16,13,12,11,10,10, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-#else
-64,40,32,27,24,22,21,20,19,18,17,17,16,16,16,15,15,15,14,14,14,14,13,13,13,13,13,13,13,12,12,12,12,12,12,
-#endif
-};
-
-static limb_t get_limb_radix(int radix)
-{
-    int i, k;
-    limb_t radixl;
-
-    k = digits_per_limb_table[radix - 2];
-    radixl = radix;
-    for(i = 1; i < k; i++)
-        radixl *= radix;
-    return radixl;
-}
-
-/* return != 0 if error */
-static int bf_integer_from_radix_rec(bf_t *r, const limb_t *tab,
-                                     limb_t n, int level, limb_t n0,
-                                     limb_t radix, bf_t *pow_tab)
-{
-    int ret;
-    if (n == 1) {
-        ret = bf_set_ui(r, tab[0]);
-    } else {
-        bf_t T_s, *T = &T_s, *B;
-        limb_t n1, n2;
-
-        n2 = (((n0 * 2) >> (level + 1)) + 1) / 2;
-        n1 = n - n2;
-        //        printf("level=%d n0=%ld n1=%ld n2=%ld\n", level, n0, n1, n2);
-        B = &pow_tab[level];
-        if (B->len == 0) {
-            ret = bf_pow_ui_ui(B, radix, n2, BF_PREC_INF, BF_RNDZ);
-            if (ret)
-                return ret;
-        }
-        ret = bf_integer_from_radix_rec(r, tab + n2, n1, level + 1, n0,
-                                        radix, pow_tab);
-        if (ret)
-            return ret;
-        ret = bf_mul(r, r, B, BF_PREC_INF, BF_RNDZ);
-        if (ret)
-            return ret;
-        bf_init(r->ctx, T);
-        ret = bf_integer_from_radix_rec(T, tab, n2, level + 1, n0,
-                                        radix, pow_tab);
-        if (!ret)
-            ret = bf_add(r, r, T, BF_PREC_INF, BF_RNDZ);
-        bf_delete(T);
-    }
-    return ret;
-    //    bf_print_str("  r=", r);
-}
-
-/* return 0 if OK != 0 if memory error */
-static int bf_integer_from_radix(bf_t *r, const limb_t *tab,
-                                 limb_t n, limb_t radix)
-{
-    bf_context_t *s = r->ctx;
-    int pow_tab_len, i, ret;
-    limb_t radixl;
-    bf_t *pow_tab;
-
-    radixl = get_limb_radix(radix);
-    pow_tab_len = ceil_log2(n) + 2; /* XXX: check */
-    pow_tab = bf_malloc(s, sizeof(pow_tab[0]) * pow_tab_len);
-    if (!pow_tab)
-        return -1;
-    for(i = 0; i < pow_tab_len; i++)
-        bf_init(r->ctx, &pow_tab[i]);
-    ret = bf_integer_from_radix_rec(r, tab, n, 0, n, radixl, pow_tab);
-    for(i = 0; i < pow_tab_len; i++) {
-        bf_delete(&pow_tab[i]);
-    }
-    bf_free(s, pow_tab);
-    return ret;
-}
-
-/* compute and round T * radix^expn. */
-int bf_mul_pow_radix(bf_t *r, const bf_t *T, limb_t radix,
-                     slimb_t expn, limb_t prec, bf_flags_t flags)
-{
-    int ret, expn_sign, overflow;
-    slimb_t e, extra_bits, prec1, ziv_extra_bits;
-    bf_t B_s, *B = &B_s;
-
-    if (T->len == 0) {
-        return bf_set(r, T);
-    } else if (expn == 0) {
-        ret = bf_set(r, T);
-        ret |= bf_round(r, prec, flags);
-        return ret;
-    }
-
-    e = expn;
-    expn_sign = 0;
-    if (e < 0) {
-        e = -e;
-        expn_sign = 1;
-    }
-    bf_init(r->ctx, B);
-    if (prec == BF_PREC_INF) {
-        /* infinite precision: only used if the result is known to be exact */
-        ret = bf_pow_ui_ui(B, radix, e, BF_PREC_INF, BF_RNDN);
-        if (expn_sign) {
-            ret |= bf_div(r, T, B, T->len * LIMB_BITS, BF_RNDN);
-        } else {
-            ret |= bf_mul(r, T, B, BF_PREC_INF, BF_RNDN);
-        }
-    } else {
-        ziv_extra_bits = 16;
-        for(;;) {
-            prec1 = prec + ziv_extra_bits;
-            /* XXX: correct overflow/underflow handling */
-            /* XXX: rigorous error analysis needed */
-            extra_bits = ceil_log2(e) * 2 + 1;
-            ret = bf_pow_ui_ui(B, radix, e, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP);
-            overflow = !bf_is_finite(B);
-            /* XXX: if bf_pow_ui_ui returns an exact result, can stop
-               after the next operation */
-            if (expn_sign)
-                ret |= bf_div(r, T, B, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP);
-            else
-                ret |= bf_mul(r, T, B, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP);
-            if (ret & BF_ST_MEM_ERROR)
-                break;
-            if ((ret & BF_ST_INEXACT) &&
-                !bf_can_round(r, prec, flags & BF_RND_MASK, prec1) &&
-                !overflow) {
-                /* and more precision and retry */
-                ziv_extra_bits = ziv_extra_bits  + (ziv_extra_bits / 2);
-            } else {
-                /* XXX: need to use __bf_round() to pass the inexact
-                   flag for the subnormal case */
-                ret = bf_round(r, prec, flags) | (ret & BF_ST_INEXACT);
-                break;
-            }
-        }
-    }
-    bf_delete(B);
-    return ret;
-}
-
-static inline int bf_to_digit(int c)
-{
-    if (c >= '0' && c <= '9')
-        return c - '0';
-    else if (c >= 'A' && c <= 'Z')
-        return c - 'A' + 10;
-    else if (c >= 'a' && c <= 'z')
-        return c - 'a' + 10;
-    else
-        return 36;
-}
-
-/* add a limb at 'pos' and decrement pos. new space is created if
-   needed. Return 0 if OK, -1 if memory error */
-static int bf_add_limb(bf_t *a, slimb_t *ppos, limb_t v)
-{
-    slimb_t pos;
-    pos = *ppos;
-    if (unlikely(pos < 0)) {
-        limb_t new_size, d, *new_tab;
-        new_size = bf_max(a->len + 1, a->len * 3 / 2);
-        new_tab = bf_realloc(a->ctx, a->tab, sizeof(limb_t) * new_size);
-        if (!new_tab)
-            return -1;
-        a->tab = new_tab;
-        d = new_size - a->len;
-        memmove(a->tab + d, a->tab, a->len * sizeof(limb_t));
-        a->len = new_size;
-        pos += d;
-    }
-    a->tab[pos--] = v;
-    *ppos = pos;
-    return 0;
-}
-
-static int bf_tolower(int c)
-{
-    if (c >= 'A' && c <= 'Z')
-        c = c - 'A' + 'a';
-    return c;
-}
-
-static int strcasestart(const char *str, const char *val, const char **ptr)
-{
-    const char *p, *q;
-    p = str;
-    q = val;
-    while (*q != '\0') {
-        if (bf_tolower(*p) != *q)
-            return 0;
-        p++;
-        q++;
-    }
-    if (ptr)
-        *ptr = p;
-    return 1;
-}
-
-static int bf_atof_internal(bf_t *r, slimb_t *pexponent,
-                            const char *str, const char **pnext, int radix,
-                            limb_t prec, bf_flags_t flags, bool is_dec)
-{
-    const char *p, *p_start;
-    int is_neg, radix_bits, exp_is_neg, ret, digits_per_limb, shift;
-    limb_t cur_limb;
-    slimb_t pos, expn, int_len, digit_count;
-    bool has_decpt, is_bin_exp;
-    bf_t a_s, *a;
-
-    *pexponent = 0;
-    p = str;
-    if (!(flags & BF_ATOF_NO_NAN_INF) && radix <= 16 &&
-        strcasestart(p, "nan", &p)) {
-        bf_set_nan(r);
-        ret = 0;
-        goto done;
-    }
-    is_neg = 0;
-
-    if (p[0] == '+') {
-        p++;
-        p_start = p;
-    } else if (p[0] == '-') {
-        is_neg = 1;
-        p++;
-        p_start = p;
-    } else {
-        p_start = p;
-    }
-    if (p[0] == '0') {
-        if ((p[1] == 'x' || p[1] == 'X') &&
-            (radix == 0 || radix == 16) &&
-            !(flags & BF_ATOF_NO_HEX)) {
-            radix = 16;
-            p += 2;
-        } else if ((p[1] == 'o' || p[1] == 'O') &&
-                   radix == 0 && (flags & BF_ATOF_BIN_OCT)) {
-            p += 2;
-            radix = 8;
-        } else if ((p[1] == 'b' || p[1] == 'B') &&
-                   radix == 0 && (flags & BF_ATOF_BIN_OCT)) {
-            p += 2;
-            radix = 2;
-        } else {
-            goto no_prefix;
-        }
-        /* there must be a digit after the prefix */
-        if (bf_to_digit((uint8_t)*p) >= radix) {
-            bf_set_nan(r);
-            ret = 0;
-            goto done;
-        }
-    no_prefix: ;
-    } else {
-        if (!(flags & BF_ATOF_NO_NAN_INF) && radix <= 16 &&
-            strcasestart(p, "inf", &p)) {
-            bf_set_inf(r, is_neg);
-            ret = 0;
-            goto done;
-        }
-    }
-
-    if (radix == 0)
-        radix = 10;
-    if (is_dec) {
-        assert(radix == 10);
-        radix_bits = 0;
-        a = r;
-    } else if ((radix & (radix - 1)) != 0) {
-        radix_bits = 0; /* base is not a power of two */
-        a = &a_s;
-        bf_init(r->ctx, a);
-    } else {
-        radix_bits = ceil_log2(radix);
-        a = r;
-    }
-
-    /* skip leading zeros */
-    /* XXX: could also skip zeros after the decimal point */
-    while (*p == '0')
-        p++;
-
-    if (radix_bits) {
-        shift = digits_per_limb = LIMB_BITS;
-    } else {
-        radix_bits = 0;
-        shift = digits_per_limb = digits_per_limb_table[radix - 2];
-    }
-    cur_limb = 0;
-    bf_resize(a, 1);
-    pos = 0;
-    has_decpt = false;
-    int_len = digit_count = 0;
-    for(;;) {
-        limb_t c;
-        if (*p == '.' && (p > p_start || bf_to_digit(p[1]) < radix)) {
-            if (has_decpt)
-                break;
-            has_decpt = true;
-            int_len = digit_count;
-            p++;
-        }
-        c = bf_to_digit(*p);
-        if (c >= radix)
-            break;
-        digit_count++;
-        p++;
-        if (radix_bits) {
-            shift -= radix_bits;
-            if (shift <= 0) {
-                cur_limb |= c >> (-shift);
-                if (bf_add_limb(a, &pos, cur_limb))
-                    goto mem_error;
-                if (shift < 0)
-                    cur_limb = c << (LIMB_BITS + shift);
-                else
-                    cur_limb = 0;
-                shift += LIMB_BITS;
-            } else {
-                cur_limb |= c << shift;
-            }
-        } else {
-            cur_limb = cur_limb * radix + c;
-            shift--;
-            if (shift == 0) {
-                if (bf_add_limb(a, &pos, cur_limb))
-                    goto mem_error;
-                shift = digits_per_limb;
-                cur_limb = 0;
-            }
-        }
-    }
-    if (!has_decpt)
-        int_len = digit_count;
-
-    /* add the last limb and pad with zeros */
-    if (shift != digits_per_limb) {
-        if (radix_bits == 0) {
-            while (shift != 0) {
-                cur_limb *= radix;
-                shift--;
-            }
-        }
-        if (bf_add_limb(a, &pos, cur_limb)) {
-        mem_error:
-            ret = BF_ST_MEM_ERROR;
-            if (!radix_bits)
-                bf_delete(a);
-            bf_set_nan(r);
-            goto done;
-        }
-    }
-
-    /* reset the next limbs to zero (we prefer to reallocate in the
-       renormalization) */
-    memset(a->tab, 0, (pos + 1) * sizeof(limb_t));
-
-    if (p == p_start) {
-        ret = 0;
-        if (!radix_bits)
-            bf_delete(a);
-        bf_set_nan(r);
-        goto done;
-    }
-
-    /* parse the exponent, if any */
-    expn = 0;
-    is_bin_exp = false;
-    if (((radix == 10 && (*p == 'e' || *p == 'E')) ||
-         (radix != 10 && (*p == '@' ||
-                          (radix_bits && (*p == 'p' || *p == 'P'))))) &&
-        p > p_start) {
-        is_bin_exp = (*p == 'p' || *p == 'P');
-        p++;
-        exp_is_neg = 0;
-        if (*p == '+') {
-            p++;
-        } else if (*p == '-') {
-            exp_is_neg = 1;
-            p++;
-        }
-        for(;;) {
-            int c;
-            c = bf_to_digit(*p);
-            if (c >= 10)
-                break;
-            if (unlikely(expn > ((BF_RAW_EXP_MAX - 2 - 9) / 10))) {
-                /* exponent overflow */
-                if (exp_is_neg) {
-                    bf_set_zero(r, is_neg);
-                    ret = BF_ST_UNDERFLOW | BF_ST_INEXACT;
-                } else {
-                    bf_set_inf(r, is_neg);
-                    ret = BF_ST_OVERFLOW | BF_ST_INEXACT;
-                }
-                goto done;
-            }
-            p++;
-            expn = expn * 10 + c;
-        }
-        if (exp_is_neg)
-            expn = -expn;
-    }
-    if (is_dec) {
-        a->expn = expn + int_len;
-        a->sign = is_neg;
-        ret = bfdec_normalize_and_round((bfdec_t *)a, prec, flags);
-    } else if (radix_bits) {
-        /* XXX: may overflow */
-        if (!is_bin_exp)
-            expn *= radix_bits;
-        a->expn = expn + (int_len * radix_bits);
-        a->sign = is_neg;
-        ret = bf_normalize_and_round(a, prec, flags);
-    } else {
-        limb_t l;
-        pos++;
-        l = a->len - pos; /* number of limbs */
-        if (l == 0) {
-            bf_set_zero(r, is_neg);
-            ret = 0;
-        } else {
-            bf_t T_s, *T = &T_s;
-
-            expn -= l * digits_per_limb - int_len;
-            bf_init(r->ctx, T);
-            if (bf_integer_from_radix(T, a->tab + pos, l, radix)) {
-                bf_set_nan(r);
-                ret = BF_ST_MEM_ERROR;
-            } else {
-                T->sign = is_neg;
-                if (flags & BF_ATOF_EXPONENT) {
-                    /* return the exponent */
-                    *pexponent = expn;
-                    ret = bf_set(r, T);
-                } else {
-                    ret = bf_mul_pow_radix(r, T, radix, expn, prec, flags);
-                }
-            }
-            bf_delete(T);
-        }
-        bf_delete(a);
-    }
- done:
-    if (pnext)
-        *pnext = p;
-    return ret;
-}
-
-/*
-   Return (status, n, exp). 'status' is the floating point status. 'n'
-   is the parsed number.
-
-   If (flags & BF_ATOF_EXPONENT) and if the radix is not a power of
-   two, the parsed number is equal to r *
-   (*pexponent)^radix. Otherwise *pexponent = 0.
-*/
-int bf_atof2(bf_t *r, slimb_t *pexponent,
-             const char *str, const char **pnext, int radix,
-             limb_t prec, bf_flags_t flags)
-{
-    return bf_atof_internal(r, pexponent, str, pnext, radix, prec, flags,
-                            false);
-}
-
-int bf_atof(bf_t *r, const char *str, const char **pnext, int radix,
-            limb_t prec, bf_flags_t flags)
-{
-    slimb_t dummy_exp;
-    return bf_atof_internal(r, &dummy_exp, str, pnext, radix, prec, flags, false);
-}
-
-/* base conversion to radix */
-
-#if LIMB_BITS == 64
-#define RADIXL_10 UINT64_C(10000000000000000000)
-#else
-#define RADIXL_10 UINT64_C(1000000000)
-#endif
-
-static const uint32_t inv_log2_radix[BF_RADIX_MAX - 1][LIMB_BITS / 32 + 1] = {
-#if LIMB_BITS == 32
-{ 0x80000000, 0x00000000,},
-{ 0x50c24e60, 0xd4d4f4a7,},
-{ 0x40000000, 0x00000000,},
-{ 0x372068d2, 0x0a1ee5ca,},
-{ 0x3184648d, 0xb8153e7a,},
-{ 0x2d983275, 0x9d5369c4,},
-{ 0x2aaaaaaa, 0xaaaaaaab,},
-{ 0x28612730, 0x6a6a7a54,},
-{ 0x268826a1, 0x3ef3fde6,},
-{ 0x25001383, 0xbac8a744,},
-{ 0x23b46706, 0x82c0c709,},
-{ 0x229729f1, 0xb2c83ded,},
-{ 0x219e7ffd, 0xa5ad572b,},
-{ 0x20c33b88, 0xda7c29ab,},
-{ 0x20000000, 0x00000000,},
-{ 0x1f50b57e, 0xac5884b3,},
-{ 0x1eb22cc6, 0x8aa6e26f,},
-{ 0x1e21e118, 0x0c5daab2,},
-{ 0x1d9dcd21, 0x439834e4,},
-{ 0x1d244c78, 0x367a0d65,},
-{ 0x1cb40589, 0xac173e0c,},
-{ 0x1c4bd95b, 0xa8d72b0d,},
-{ 0x1bead768, 0x98f8ce4c,},
-{ 0x1b903469, 0x050f72e5,},
-{ 0x1b3b433f, 0x2eb06f15,},
-{ 0x1aeb6f75, 0x9c46fc38,},
-{ 0x1aa038eb, 0x0e3bfd17,},
-{ 0x1a593062, 0xb38d8c56,},
-{ 0x1a15f4c3, 0x2b95a2e6,},
-{ 0x19d630dc, 0xcc7ddef9,},
-{ 0x19999999, 0x9999999a,},
-{ 0x195fec80, 0x8a609431,},
-{ 0x1928ee7b, 0x0b4f22f9,},
-{ 0x18f46acf, 0x8c06e318,},
-{ 0x18c23246, 0xdc0a9f3d,},
-#else
-{ 0x80000000, 0x00000000, 0x00000000,},
-{ 0x50c24e60, 0xd4d4f4a7, 0x021f57bc,},
-{ 0x40000000, 0x00000000, 0x00000000,},
-{ 0x372068d2, 0x0a1ee5ca, 0x19ea911b,},
-{ 0x3184648d, 0xb8153e7a, 0x7fc2d2e1,},
-{ 0x2d983275, 0x9d5369c4, 0x4dec1661,},
-{ 0x2aaaaaaa, 0xaaaaaaaa, 0xaaaaaaab,},
-{ 0x28612730, 0x6a6a7a53, 0x810fabde,},
-{ 0x268826a1, 0x3ef3fde6, 0x23e2566b,},
-{ 0x25001383, 0xbac8a744, 0x385a3349,},
-{ 0x23b46706, 0x82c0c709, 0x3f891718,},
-{ 0x229729f1, 0xb2c83ded, 0x15fba800,},
-{ 0x219e7ffd, 0xa5ad572a, 0xe169744b,},
-{ 0x20c33b88, 0xda7c29aa, 0x9bddee52,},
-{ 0x20000000, 0x00000000, 0x00000000,},
-{ 0x1f50b57e, 0xac5884b3, 0x70e28eee,},
-{ 0x1eb22cc6, 0x8aa6e26f, 0x06d1a2a2,},
-{ 0x1e21e118, 0x0c5daab1, 0x81b4f4bf,},
-{ 0x1d9dcd21, 0x439834e3, 0x81667575,},
-{ 0x1d244c78, 0x367a0d64, 0xc8204d6d,},
-{ 0x1cb40589, 0xac173e0c, 0x3b7b16ba,},
-{ 0x1c4bd95b, 0xa8d72b0d, 0x5879f25a,},
-{ 0x1bead768, 0x98f8ce4c, 0x66cc2858,},
-{ 0x1b903469, 0x050f72e5, 0x0cf5488e,},
-{ 0x1b3b433f, 0x2eb06f14, 0x8c89719c,},
-{ 0x1aeb6f75, 0x9c46fc37, 0xab5fc7e9,},
-{ 0x1aa038eb, 0x0e3bfd17, 0x1bd62080,},
-{ 0x1a593062, 0xb38d8c56, 0x7998ab45,},
-{ 0x1a15f4c3, 0x2b95a2e6, 0x46aed6a0,},
-{ 0x19d630dc, 0xcc7ddef9, 0x5aadd61b,},
-{ 0x19999999, 0x99999999, 0x9999999a,},
-{ 0x195fec80, 0x8a609430, 0xe1106014,},
-{ 0x1928ee7b, 0x0b4f22f9, 0x5f69791d,},
-{ 0x18f46acf, 0x8c06e318, 0x4d2aeb2c,},
-{ 0x18c23246, 0xdc0a9f3d, 0x3fe16970,},
-#endif
-};
-
-static const limb_t log2_radix[BF_RADIX_MAX - 1] = {
-#if LIMB_BITS == 32
-0x20000000,
-0x32b80347,
-0x40000000,
-0x4a4d3c26,
-0x52b80347,
-0x59d5d9fd,
-0x60000000,
-0x6570068e,
-0x6a4d3c26,
-0x6eb3a9f0,
-0x72b80347,
-0x766a008e,
-0x79d5d9fd,
-0x7d053f6d,
-0x80000000,
-0x82cc7edf,
-0x8570068e,
-0x87ef05ae,
-0x8a4d3c26,
-0x8c8ddd45,
-0x8eb3a9f0,
-0x90c10501,
-0x92b80347,
-0x949a784c,
-0x966a008e,
-0x982809d6,
-0x99d5d9fd,
-0x9b74948f,
-0x9d053f6d,
-0x9e88c6b3,
-0xa0000000,
-0xa16bad37,
-0xa2cc7edf,
-0xa4231623,
-0xa570068e,
-#else
-0x2000000000000000,
-0x32b803473f7ad0f4,
-0x4000000000000000,
-0x4a4d3c25e68dc57f,
-0x52b803473f7ad0f4,
-0x59d5d9fd5010b366,
-0x6000000000000000,
-0x6570068e7ef5a1e8,
-0x6a4d3c25e68dc57f,
-0x6eb3a9f01975077f,
-0x72b803473f7ad0f4,
-0x766a008e4788cbcd,
-0x79d5d9fd5010b366,
-0x7d053f6d26089673,
-0x8000000000000000,
-0x82cc7edf592262d0,
-0x8570068e7ef5a1e8,
-0x87ef05ae409a0289,
-0x8a4d3c25e68dc57f,
-0x8c8ddd448f8b845a,
-0x8eb3a9f01975077f,
-0x90c10500d63aa659,
-0x92b803473f7ad0f4,
-0x949a784bcd1b8afe,
-0x966a008e4788cbcd,
-0x982809d5be7072dc,
-0x99d5d9fd5010b366,
-0x9b74948f5532da4b,
-0x9d053f6d26089673,
-0x9e88c6b3626a72aa,
-0xa000000000000000,
-0xa16bad3758efd873,
-0xa2cc7edf592262d0,
-0xa4231623369e78e6,
-0xa570068e7ef5a1e8,
-#endif
-};
-
-/* compute floor(a*b) or ceil(a*b) with b = log2(radix) or
-   b=1/log2(radix). For is_inv = 0, strict accuracy is not guaranteed
-   when radix is not a power of two. */
-slimb_t bf_mul_log2_radix(slimb_t a1, unsigned int radix, int is_inv,
-                          int is_ceil1)
-{
-    int is_neg;
-    limb_t a;
-    bool is_ceil;
-
-    is_ceil = is_ceil1;
-    a = a1;
-    if (a1 < 0) {
-        a = -a;
-        is_neg = 1;
-    } else {
-        is_neg = 0;
-    }
-    is_ceil ^= is_neg;
-    if ((radix & (radix - 1)) == 0) {
-        int radix_bits;
-        /* radix is a power of two */
-        radix_bits = ceil_log2(radix);
-        if (is_inv) {
-            if (is_ceil)
-                a += radix_bits - 1;
-            a = a / radix_bits;
-        } else {
-            a = a * radix_bits;
-        }
-    } else {
-        const uint32_t *tab;
-        limb_t b0, b1;
-        dlimb_t t;
-
-        if (is_inv) {
-            tab = inv_log2_radix[radix - 2];
-#if LIMB_BITS == 32
-            b1 = tab[0];
-            b0 = tab[1];
-#else
-            b1 = ((limb_t)tab[0] << 32) | tab[1];
-            b0 = (limb_t)tab[2] << 32;
-#endif
-            t = (dlimb_t)b0 * (dlimb_t)a;
-            t = (dlimb_t)b1 * (dlimb_t)a + (t >> LIMB_BITS);
-            a = t >> (LIMB_BITS - 1);
-        } else {
-            b0 = log2_radix[radix - 2];
-            t = (dlimb_t)b0 * (dlimb_t)a;
-            a = t >> (LIMB_BITS - 3);
-        }
-        /* a = floor(result) and 'result' cannot be an integer */
-        a += is_ceil;
-    }
-    if (is_neg)
-        a = -a;
-    return a;
-}
-
-/* 'n' is the number of output limbs */
-static int bf_integer_to_radix_rec(bf_t *pow_tab,
-                                   limb_t *out, const bf_t *a, limb_t n,
-                                   int level, limb_t n0, limb_t radixl,
-                                   unsigned int radixl_bits)
-{
-    limb_t n1, n2, q_prec;
-    int ret;
-
-    assert(n >= 1);
-    if (n == 1) {
-        out[0] = get_bits(a->tab, a->len, a->len * LIMB_BITS - a->expn);
-    } else if (n == 2) {
-        dlimb_t t;
-        slimb_t pos;
-        pos = a->len * LIMB_BITS - a->expn;
-        t = ((dlimb_t)get_bits(a->tab, a->len, pos + LIMB_BITS) << LIMB_BITS) |
-            get_bits(a->tab, a->len, pos);
-        if (likely(radixl == RADIXL_10)) {
-            /* use division by a constant when possible */
-            out[0] = t % RADIXL_10;
-            out[1] = t / RADIXL_10;
-        } else {
-            out[0] = t % radixl;
-            out[1] = t / radixl;
-        }
-    } else {
-        bf_t Q, R, *B, *B_inv;
-        int q_add;
-        bf_init(a->ctx, &Q);
-        bf_init(a->ctx, &R);
-        n2 = (((n0 * 2) >> (level + 1)) + 1) / 2;
-        n1 = n - n2;
-        B = &pow_tab[2 * level];
-        B_inv = &pow_tab[2 * level + 1];
-        ret = 0;
-        if (B->len == 0) {
-            /* compute BASE^n2 */
-            ret |= bf_pow_ui_ui(B, radixl, n2, BF_PREC_INF, BF_RNDZ);
-            /* we use enough bits for the maximum possible 'n1' value,
-               i.e. n2 + 1 */
-            ret |= bf_set_ui(&R, 1);
-            ret |= bf_div(B_inv, &R, B, (n2 + 1) * radixl_bits + 2, BF_RNDN);
-        }
-        //        printf("%d: n1=% " PRId64 " n2=%" PRId64 "\n", level, n1, n2);
-        q_prec = n1 * radixl_bits;
-        ret |= bf_mul(&Q, a, B_inv, q_prec, BF_RNDN);
-        ret |= bf_rint(&Q, BF_RNDZ);
-
-        ret |= bf_mul(&R, &Q, B, BF_PREC_INF, BF_RNDZ);
-        ret |= bf_sub(&R, a, &R, BF_PREC_INF, BF_RNDZ);
-
-        if (ret & BF_ST_MEM_ERROR)
-            goto fail;
-        /* adjust if necessary */
-        q_add = 0;
-        while (R.sign && R.len != 0) {
-            if (bf_add(&R, &R, B, BF_PREC_INF, BF_RNDZ))
-                goto fail;
-            q_add--;
-        }
-        while (bf_cmpu(&R, B) >= 0) {
-            if (bf_sub(&R, &R, B, BF_PREC_INF, BF_RNDZ))
-                goto fail;
-            q_add++;
-        }
-        if (q_add != 0) {
-            if (bf_add_si(&Q, &Q, q_add, BF_PREC_INF, BF_RNDZ))
-                goto fail;
-        }
-        if (bf_integer_to_radix_rec(pow_tab, out + n2, &Q, n1, level + 1, n0,
-                                    radixl, radixl_bits))
-            goto fail;
-        if (bf_integer_to_radix_rec(pow_tab, out, &R, n2, level + 1, n0,
-                                    radixl, radixl_bits)) {
-        fail:
-            bf_delete(&Q);
-            bf_delete(&R);
-            return -1;
-        }
-        bf_delete(&Q);
-        bf_delete(&R);
-    }
-    return 0;
-}
-
-/* return 0 if OK != 0 if memory error */
-static int bf_integer_to_radix(bf_t *r, const bf_t *a, limb_t radixl)
-{
-    bf_context_t *s = r->ctx;
-    limb_t r_len;
-    bf_t *pow_tab;
-    int i, pow_tab_len, ret;
-
-    r_len = r->len;
-    pow_tab_len = (ceil_log2(r_len) + 2) * 2; /* XXX: check */
-    pow_tab = bf_malloc(s, sizeof(pow_tab[0]) * pow_tab_len);
-    if (!pow_tab)
-        return -1;
-    for(i = 0; i < pow_tab_len; i++)
-        bf_init(r->ctx, &pow_tab[i]);
-
-    ret = bf_integer_to_radix_rec(pow_tab, r->tab, a, r_len, 0, r_len, radixl,
-                                  ceil_log2(radixl));
-
-    for(i = 0; i < pow_tab_len; i++) {
-        bf_delete(&pow_tab[i]);
-    }
-    bf_free(s, pow_tab);
-    return ret;
-}
-
-/* a must be >= 0. 'P' is the wanted number of digits in radix
-   'radix'. 'r' is the mantissa represented as an integer. *pE
-   contains the exponent. Return != 0 if memory error. */
-static int bf_convert_to_radix(bf_t *r, slimb_t *pE,
-                               const bf_t *a, int radix,
-                               limb_t P, bf_rnd_t rnd_mode,
-                               bool is_fixed_exponent)
-{
-    slimb_t E, e, prec, extra_bits, ziv_extra_bits, prec0;
-    bf_t B_s, *B = &B_s;
-    int e_sign, ret, res;
-
-    if (a->len == 0) {
-        /* zero case */
-        *pE = 0;
-        return bf_set(r, a);
-    }
-
-    if (is_fixed_exponent) {
-        E = *pE;
-    } else {
-        /* compute the new exponent */
-        E = 1 + bf_mul_log2_radix(a->expn - 1, radix, true, false);
-    }
-    //    bf_print_str("a", a);
-    //    printf("E=%ld P=%ld radix=%d\n", E, P, radix);
-
-    for(;;) {
-        e = P - E;
-        e_sign = 0;
-        if (e < 0) {
-            e = -e;
-            e_sign = 1;
-        }
-        /* Note: precision for log2(radix) is not critical here */
-        prec0 = bf_mul_log2_radix(P, radix, false, true);
-        ziv_extra_bits = 16;
-        for(;;) {
-            prec = prec0 + ziv_extra_bits;
-            /* XXX: rigorous error analysis needed */
-            extra_bits = ceil_log2(e) * 2 + 1;
-            ret = bf_pow_ui_ui(r, radix, e, prec + extra_bits,
-                               BF_RNDN | BF_FLAG_EXT_EXP);
-            if (!e_sign)
-                ret |= bf_mul(r, r, a, prec + extra_bits,
-                              BF_RNDN | BF_FLAG_EXT_EXP);
-            else
-                ret |= bf_div(r, a, r, prec + extra_bits,
-                              BF_RNDN | BF_FLAG_EXT_EXP);
-            if (ret & BF_ST_MEM_ERROR)
-                return BF_ST_MEM_ERROR;
-            /* if the result is not exact, check that it can be safely
-               rounded to an integer */
-            if ((ret & BF_ST_INEXACT) &&
-                !bf_can_round(r, r->expn, rnd_mode, prec)) {
-                /* and more precision and retry */
-                ziv_extra_bits = ziv_extra_bits  + (ziv_extra_bits / 2);
-                continue;
-            } else {
-                ret = bf_rint(r, rnd_mode);
-                if (ret & BF_ST_MEM_ERROR)
-                    return BF_ST_MEM_ERROR;
-                break;
-            }
-        }
-        if (is_fixed_exponent)
-            break;
-        /* check that the result is < B^P */
-        /* XXX: do a fast approximate test first ? */
-        bf_init(r->ctx, B);
-        ret = bf_pow_ui_ui(B, radix, P, BF_PREC_INF, BF_RNDZ);
-        if (ret) {
-            bf_delete(B);
-            return ret;
-        }
-        res = bf_cmpu(r, B);
-        bf_delete(B);
-        if (res < 0)
-            break;
-        /* try a larger exponent */
-        E++;
-    }
-    *pE = E;
-    return 0;
-}
-
-static void limb_to_a(char *buf, limb_t n, unsigned int radix, int len)
-{
-    int digit, i;
-
-    if (radix == 10) {
-        /* specific case with constant divisor */
-        for(i = len - 1; i >= 0; i--) {
-            digit = (limb_t)n % 10;
-            n = (limb_t)n / 10;
-            buf[i] = digit + '0';
-        }
-    } else {
-        for(i = len - 1; i >= 0; i--) {
-            digit = (limb_t)n % radix;
-            n = (limb_t)n / radix;
-            if (digit < 10)
-                digit += '0';
-            else
-                digit += 'a' - 10;
-            buf[i] = digit;
-        }
-    }
-}
-
-/* for power of 2 radixes */
-static void limb_to_a2(char *buf, limb_t n, unsigned int radix_bits, int len)
-{
-    int digit, i;
-    unsigned int mask;
-
-    mask = (1 << radix_bits) - 1;
-    for(i = len - 1; i >= 0; i--) {
-        digit = n & mask;
-        n >>= radix_bits;
-        if (digit < 10)
-            digit += '0';
-        else
-            digit += 'a' - 10;
-        buf[i] = digit;
-    }
-}
-
-/* 'a' must be an integer if the is_dec = false or if the radix is not
-   a power of two. A dot is added before the 'dot_pos' digit. dot_pos
-   = n_digits does not display the dot. 0 <= dot_pos <=
-   n_digits. n_digits >= 1. */
-static void output_digits(DynBuf *s, const bf_t *a1, int radix, limb_t n_digits,
-                          limb_t dot_pos, bool is_dec)
-{
-    limb_t i, v, l;
-    slimb_t pos, pos_incr;
-    int digits_per_limb, buf_pos, radix_bits, first_buf_pos;
-    char buf[65];
-    bf_t a_s, *a;
-
-    if (is_dec) {
-        digits_per_limb = LIMB_DIGITS;
-        a = (bf_t *)a1;
-        radix_bits = 0;
-        pos = a->len;
-        pos_incr = 1;
-        first_buf_pos = 0;
-    } else if ((radix & (radix - 1)) == 0) {
-        a = (bf_t *)a1;
-        radix_bits = ceil_log2(radix);
-        digits_per_limb = LIMB_BITS / radix_bits;
-        pos_incr = digits_per_limb * radix_bits;
-        /* digits are aligned relative to the radix point */
-        pos = a->len * LIMB_BITS + smod(-a->expn, radix_bits);
-        first_buf_pos = 0;
-    } else {
-        limb_t n, radixl;
-
-        digits_per_limb = digits_per_limb_table[radix - 2];
-        radixl = get_limb_radix(radix);
-        a = &a_s;
-        bf_init(a1->ctx, a);
-        n = (n_digits + digits_per_limb - 1) / digits_per_limb;
-        if (bf_resize(a, n)) {
-            dbuf_set_error(s);
-            goto done;
-        }
-        if (bf_integer_to_radix(a, a1, radixl)) {
-            dbuf_set_error(s);
-            goto done;
-        }
-        radix_bits = 0;
-        pos = n;
-        pos_incr = 1;
-        first_buf_pos = pos * digits_per_limb - n_digits;
-    }
-    buf_pos = digits_per_limb;
-    i = 0;
-    while (i < n_digits) {
-        if (buf_pos == digits_per_limb) {
-            pos -= pos_incr;
-            if (radix_bits == 0) {
-                v = get_limbz(a, pos);
-                limb_to_a(buf, v, radix, digits_per_limb);
-            } else {
-                v = get_bits(a->tab, a->len, pos);
-                limb_to_a2(buf, v, radix_bits, digits_per_limb);
-            }
-            buf_pos = first_buf_pos;
-            first_buf_pos = 0;
-        }
-        if (i < dot_pos) {
-            l = dot_pos;
-        } else {
-            if (i == dot_pos)
-                dbuf_putc(s, '.');
-            l = n_digits;
-        }
-        l = bf_min(digits_per_limb - buf_pos, l - i);
-        dbuf_put(s, (uint8_t *)(buf + buf_pos), l);
-        buf_pos += l;
-        i += l;
-    }
- done:
-    if (a != a1)
-        bf_delete(a);
-}
-
-static void *bf_dbuf_realloc(void *opaque, void *ptr, size_t size)
-{
-    bf_context_t *s = opaque;
-    return bf_realloc(s, ptr, size);
-}
-
-/* return the length in bytes. A trailing '\0' is added */
-static char *bf_ftoa_internal(size_t *plen, const bf_t *a2, int radix,
-                              limb_t prec, bf_flags_t flags, bool is_dec)
-{
-    bf_context_t *ctx = a2->ctx;
-    DynBuf s_s, *s = &s_s;
-    int radix_bits;
-
-    //    bf_print_str("ftoa", a2);
-    //    printf("radix=%d\n", radix);
-    dbuf_init2(s, ctx, bf_dbuf_realloc);
-    if (a2->expn == BF_EXP_NAN) {
-        dbuf_putstr(s, "NaN");
-    } else {
-        if (a2->sign)
-            dbuf_putc(s, '-');
-        if (a2->expn == BF_EXP_INF) {
-            if (flags & BF_FTOA_JS_QUIRKS)
-                dbuf_putstr(s, "Infinity");
-            else
-                dbuf_putstr(s, "Inf");
-        } else {
-            int fmt, ret;
-            slimb_t n_digits, n, i, n_max, n1;
-            bf_t a1_s, *a1 = &a1_s;
-
-            if ((radix & (radix - 1)) != 0)
-                radix_bits = 0;
-            else
-                radix_bits = ceil_log2(radix);
-
-            fmt = flags & BF_FTOA_FORMAT_MASK;
-            bf_init(ctx, a1);
-            if (fmt == BF_FTOA_FORMAT_FRAC) {
-                if (is_dec || radix_bits != 0) {
-                    if (bf_set(a1, a2))
-                        goto fail1;
-#ifdef USE_BF_DEC
-                    if (is_dec) {
-                        if (bfdec_round((bfdec_t *)a1, prec, (flags & BF_RND_MASK) | BF_FLAG_RADPNT_PREC) & BF_ST_MEM_ERROR)
-                            goto fail1;
-                        n = a1->expn;
-                    } else
-#endif
-                    {
-                        if (bf_round(a1, prec * radix_bits, (flags & BF_RND_MASK) | BF_FLAG_RADPNT_PREC) & BF_ST_MEM_ERROR)
-                            goto fail1;
-                        n = ceil_div(a1->expn, radix_bits);
-                    }
-                    if (flags & BF_FTOA_ADD_PREFIX) {
-                        if (radix == 16)
-                            dbuf_putstr(s, "0x");
-                        else if (radix == 8)
-                            dbuf_putstr(s, "0o");
-                        else if (radix == 2)
-                            dbuf_putstr(s, "0b");
-                    }
-                    if (a1->expn == BF_EXP_ZERO) {
-                        dbuf_putstr(s, "0");
-                        if (prec > 0) {
-                            dbuf_putstr(s, ".");
-                            for(i = 0; i < prec; i++) {
-                                dbuf_putc(s, '0');
-                            }
-                        }
-                    } else {
-                        n_digits = prec + n;
-                        if (n <= 0) {
-                            /* 0.x */
-                            dbuf_putstr(s, "0.");
-                            for(i = 0; i < -n; i++) {
-                                dbuf_putc(s, '0');
-                            }
-                            if (n_digits > 0) {
-                                output_digits(s, a1, radix, n_digits, n_digits, is_dec);
-                            }
-                        } else {
-                            output_digits(s, a1, radix, n_digits, n, is_dec);
-                        }
-                    }
-                } else {
-                    size_t pos, start;
-                    bf_t a_s, *a = &a_s;
-
-                    /* make a positive number */
-                    a->tab = a2->tab;
-                    a->len = a2->len;
-                    a->expn = a2->expn;
-                    a->sign = 0;
-
-                    /* one more digit for the rounding */
-                    n = 1 + bf_mul_log2_radix(bf_max(a->expn, 0), radix, true, true);
-                    n_digits = n + prec;
-                    n1 = n;
-                    if (bf_convert_to_radix(a1, &n1, a, radix, n_digits,
-                                            flags & BF_RND_MASK, true))
-                        goto fail1;
-                    start = s->size;
-                    output_digits(s, a1, radix, n_digits, n, is_dec);
-                    /* remove leading zeros because we allocated one more digit */
-                    pos = start;
-                    while ((pos + 1) < s->size && s->buf[pos] == '0' &&
-                           s->buf[pos + 1] != '.')
-                        pos++;
-                    if (pos > start) {
-                        memmove(s->buf + start, s->buf + pos, s->size - pos);
-                        s->size -= (pos - start);
-                    }
-                }
-            } else {
-#ifdef USE_BF_DEC
-                if (is_dec) {
-                    if (bf_set(a1, a2))
-                        goto fail1;
-                    if (fmt == BF_FTOA_FORMAT_FIXED) {
-                        n_digits = prec;
-                        n_max = n_digits;
-                        if (bfdec_round((bfdec_t *)a1, prec, (flags & BF_RND_MASK)) & BF_ST_MEM_ERROR)
-                            goto fail1;
-                    } else {
-                        /* prec is ignored */
-                        prec = n_digits = a1->len * LIMB_DIGITS;
-                        /* remove the trailing zero digits */
-                        while (n_digits > 1 &&
-                               get_digit(a1->tab, a1->len, prec - n_digits) == 0) {
-                            n_digits--;
-                        }
-                        n_max = n_digits + 4;
-                    }
-                    n = a1->expn;
-                } else
-#endif
-                if (radix_bits != 0) {
-                    if (bf_set(a1, a2))
-                        goto fail1;
-                    if (fmt == BF_FTOA_FORMAT_FIXED) {
-                        slimb_t prec_bits;
-                        n_digits = prec;
-                        n_max = n_digits;
-                        /* align to the radix point */
-                        prec_bits = prec * radix_bits -
-                            smod(-a1->expn, radix_bits);
-                        if (bf_round(a1, prec_bits,
-                                     (flags & BF_RND_MASK)) & BF_ST_MEM_ERROR)
-                            goto fail1;
-                    } else {
-                        limb_t digit_mask;
-                        slimb_t pos;
-                        /* position of the digit before the most
-                           significant digit in bits */
-                        pos = a1->len * LIMB_BITS +
-                            smod(-a1->expn, radix_bits);
-                        n_digits = ceil_div(pos, radix_bits);
-                        /* remove the trailing zero digits */
-                        digit_mask = ((limb_t)1 << radix_bits) - 1;
-                        while (n_digits > 1 &&
-                               (get_bits(a1->tab, a1->len, pos - n_digits * radix_bits) & digit_mask) == 0) {
-                            n_digits--;
-                        }
-                        n_max = n_digits + 4;
-                    }
-                    n = ceil_div(a1->expn, radix_bits);
-                } else {
-                    bf_t a_s, *a = &a_s;
-
-                    /* make a positive number */
-                    a->tab = a2->tab;
-                    a->len = a2->len;
-                    a->expn = a2->expn;
-                    a->sign = 0;
-
-                    if (fmt == BF_FTOA_FORMAT_FIXED) {
-                        n_digits = prec;
-                        n_max = n_digits;
-                    } else {
-                        slimb_t n_digits_max, n_digits_min;
-
-                        assert(prec != BF_PREC_INF);
-                        n_digits = 1 + bf_mul_log2_radix(prec, radix, true, true);
-                        /* max number of digits for non exponential
-                           notation. The rational is to have the same rule
-                           as JS i.e. n_max = 21 for 64 bit float in base 10. */
-                        n_max = n_digits + 4;
-                        if (fmt == BF_FTOA_FORMAT_FREE_MIN) {
-                            bf_t b_s, *b = &b_s;
-
-                            /* find the minimum number of digits by
-                               dichotomy. */
-                            /* XXX: inefficient */
-                            n_digits_max = n_digits;
-                            n_digits_min = 1;
-                            bf_init(ctx, b);
-                            while (n_digits_min < n_digits_max) {
-                                n_digits = (n_digits_min + n_digits_max) / 2;
-                                if (bf_convert_to_radix(a1, &n, a, radix, n_digits,
-                                                        flags & BF_RND_MASK, false)) {
-                                    bf_delete(b);
-                                    goto fail1;
-                                }
-                                /* convert back to a number and compare */
-                                ret = bf_mul_pow_radix(b, a1, radix, n - n_digits,
-                                                       prec,
-                                                       (flags & ~BF_RND_MASK) |
-                                                       BF_RNDN);
-                                if (ret & BF_ST_MEM_ERROR) {
-                                    bf_delete(b);
-                                    goto fail1;
-                                }
-                                if (bf_cmpu(b, a) == 0) {
-                                    n_digits_max = n_digits;
-                                } else {
-                                    n_digits_min = n_digits + 1;
-                                }
-                            }
-                            bf_delete(b);
-                            n_digits = n_digits_max;
-                        }
-                    }
-                    if (bf_convert_to_radix(a1, &n, a, radix, n_digits,
-                                            flags & BF_RND_MASK, false)) {
-                    fail1:
-                        bf_delete(a1);
-                        goto fail;
-                    }
-                }
-                if (a1->expn == BF_EXP_ZERO &&
-                    fmt != BF_FTOA_FORMAT_FIXED &&
-                    !(flags & BF_FTOA_FORCE_EXP)) {
-                    /* just output zero */
-                    dbuf_putstr(s, "0");
-                } else {
-                    if (flags & BF_FTOA_ADD_PREFIX) {
-                        if (radix == 16)
-                            dbuf_putstr(s, "0x");
-                        else if (radix == 8)
-                            dbuf_putstr(s, "0o");
-                        else if (radix == 2)
-                            dbuf_putstr(s, "0b");
-                    }
-                    if (a1->expn == BF_EXP_ZERO)
-                        n = 1;
-                    if ((flags & BF_FTOA_FORCE_EXP) ||
-                        n <= -6 || n > n_max) {
-                        /* exponential notation */
-                        output_digits(s, a1, radix, n_digits, 1, is_dec);
-                        if (radix_bits != 0 && radix <= 16) {
-                            slimb_t exp_n = (n - 1) * radix_bits;
-                            if (flags & BF_FTOA_JS_QUIRKS)
-                                dbuf_printf(s, "p%+" PRId_LIMB, exp_n);
-                            else
-                                dbuf_printf(s, "p%" PRId_LIMB, exp_n);
-                        } else {
-                            const char c = radix <= 10 ? 'e' : '@';
-                            if (flags & BF_FTOA_JS_QUIRKS)
-                                dbuf_printf(s, "%c%+" PRId_LIMB, c, n - 1);
-                            else
-                                dbuf_printf(s, "%c%" PRId_LIMB, c, n - 1);
-                        }
-                    } else if (n <= 0) {
-                        /* 0.x */
-                        dbuf_putstr(s, "0.");
-                        for(i = 0; i < -n; i++) {
-                            dbuf_putc(s, '0');
-                        }
-                        output_digits(s, a1, radix, n_digits, n_digits, is_dec);
-                    } else {
-                        if (n_digits <= n) {
-                            /* no dot */
-                            output_digits(s, a1, radix, n_digits, n_digits, is_dec);
-                            for(i = 0; i < (n - n_digits); i++)
-                                dbuf_putc(s, '0');
-                        } else {
-                            output_digits(s, a1, radix, n_digits, n, is_dec);
-                        }
-                    }
-                }
-            }
-            bf_delete(a1);
-        }
-    }
-    dbuf_putc(s, '\0');
-    if (dbuf_error(s))
-        goto fail;
-    if (plen)
-        *plen = s->size - 1;
-    return (char *)s->buf;
- fail:
-    bf_free(ctx, s->buf);
-    if (plen)
-        *plen = 0;
-    return NULL;
-}
-
-char *bf_ftoa(size_t *plen, const bf_t *a, int radix, limb_t prec,
-              bf_flags_t flags)
-{
-    return bf_ftoa_internal(plen, a, radix, prec, flags, false);
-}
-
-/***************************************************************/
-/* transcendental functions */
-
-/* Note: the algorithm is from MPFR */
-static void bf_const_log2_rec(bf_t *T, bf_t *P, bf_t *Q, limb_t n1,
-                              limb_t n2, bool need_P)
-{
-    bf_context_t *s = T->ctx;
-    if ((n2 - n1) == 1) {
-        if (n1 == 0) {
-            bf_set_ui(P, 3);
-        } else {
-            bf_set_ui(P, n1);
-            P->sign = 1;
-        }
-        bf_set_ui(Q, 2 * n1 + 1);
-        Q->expn += 2;
-        bf_set(T, P);
-    } else {
-        limb_t m;
-        bf_t T1_s, *T1 = &T1_s;
-        bf_t P1_s, *P1 = &P1_s;
-        bf_t Q1_s, *Q1 = &Q1_s;
-
-        m = n1 + ((n2 - n1) >> 1);
-        bf_const_log2_rec(T, P, Q, n1, m, true);
-        bf_init(s, T1);
-        bf_init(s, P1);
-        bf_init(s, Q1);
-        bf_const_log2_rec(T1, P1, Q1, m, n2, need_P);
-        bf_mul(T, T, Q1, BF_PREC_INF, BF_RNDZ);
-        bf_mul(T1, T1, P, BF_PREC_INF, BF_RNDZ);
-        bf_add(T, T, T1, BF_PREC_INF, BF_RNDZ);
-        if (need_P)
-            bf_mul(P, P, P1, BF_PREC_INF, BF_RNDZ);
-        bf_mul(Q, Q, Q1, BF_PREC_INF, BF_RNDZ);
-        bf_delete(T1);
-        bf_delete(P1);
-        bf_delete(Q1);
-    }
-}
-
-/* compute log(2) with faithful rounding at precision 'prec' */
-static void bf_const_log2_internal(bf_t *T, limb_t prec)
-{
-    limb_t w, N;
-    bf_t P_s, *P = &P_s;
-    bf_t Q_s, *Q = &Q_s;
-
-    w = prec + 15;
-    N = w / 3 + 1;
-    bf_init(T->ctx, P);
-    bf_init(T->ctx, Q);
-    bf_const_log2_rec(T, P, Q, 0, N, false);
-    bf_div(T, T, Q, prec, BF_RNDN);
-    bf_delete(P);
-    bf_delete(Q);
-}
-
-/* PI constant */
-
-#define CHUD_A 13591409
-#define CHUD_B 545140134
-#define CHUD_C 640320
-#define CHUD_BITS_PER_TERM 47
-
-static void chud_bs(bf_t *P, bf_t *Q, bf_t *G, int64_t a, int64_t b, int need_g,
-                    limb_t prec)
-{
-    bf_context_t *s = P->ctx;
-    int64_t c;
-
-    if (a == (b - 1)) {
-        bf_t T0, T1;
-
-        bf_init(s, &T0);
-        bf_init(s, &T1);
-        bf_set_ui(G, 2 * b - 1);
-        bf_mul_ui(G, G, 6 * b - 1, prec, BF_RNDN);
-        bf_mul_ui(G, G, 6 * b - 5, prec, BF_RNDN);
-        bf_set_ui(&T0, CHUD_B);
-        bf_mul_ui(&T0, &T0, b, prec, BF_RNDN);
-        bf_set_ui(&T1, CHUD_A);
-        bf_add(&T0, &T0, &T1, prec, BF_RNDN);
-        bf_mul(P, G, &T0, prec, BF_RNDN);
-        P->sign = b & 1;
-
-        bf_set_ui(Q, b);
-        bf_mul_ui(Q, Q, b, prec, BF_RNDN);
-        bf_mul_ui(Q, Q, b, prec, BF_RNDN);
-        bf_mul_ui(Q, Q, (uint64_t)CHUD_C * CHUD_C * CHUD_C / 24, prec, BF_RNDN);
-        bf_delete(&T0);
-        bf_delete(&T1);
-    } else {
-        bf_t P2, Q2, G2;
-
-        bf_init(s, &P2);
-        bf_init(s, &Q2);
-        bf_init(s, &G2);
-
-        c = (a + b) / 2;
-        chud_bs(P, Q, G, a, c, 1, prec);
-        chud_bs(&P2, &Q2, &G2, c, b, need_g, prec);
-
-        /* Q = Q1 * Q2 */
-        /* G = G1 * G2 */
-        /* P = P1 * Q2 + P2 * G1 */
-        bf_mul(&P2, &P2, G, prec, BF_RNDN);
-        if (!need_g)
-            bf_set_ui(G, 0);
-        bf_mul(P, P, &Q2, prec, BF_RNDN);
-        bf_add(P, P, &P2, prec, BF_RNDN);
-        bf_delete(&P2);
-
-        bf_mul(Q, Q, &Q2, prec, BF_RNDN);
-        bf_delete(&Q2);
-        if (need_g)
-            bf_mul(G, G, &G2, prec, BF_RNDN);
-        bf_delete(&G2);
-    }
-}
-
-/* compute Pi with faithful rounding at precision 'prec' using the
-   Chudnovsky formula */
-static void bf_const_pi_internal(bf_t *Q, limb_t prec)
-{
-    bf_context_t *s = Q->ctx;
-    int64_t n, prec1;
-    bf_t P, G;
-
-    /* number of serie terms */
-    n = prec / CHUD_BITS_PER_TERM + 1;
-    /* XXX: precision analysis */
-    prec1 = prec + 32;
-
-    bf_init(s, &P);
-    bf_init(s, &G);
-
-    chud_bs(&P, Q, &G, 0, n, 0, BF_PREC_INF);
-
-    bf_mul_ui(&G, Q, CHUD_A, prec1, BF_RNDN);
-    bf_add(&P, &G, &P, prec1, BF_RNDN);
-    bf_div(Q, Q, &P, prec1, BF_RNDF);
-
-    bf_set_ui(&P, CHUD_C);
-    bf_sqrt(&G, &P, prec1, BF_RNDF);
-    bf_mul_ui(&G, &G, (uint64_t)CHUD_C / 12, prec1, BF_RNDF);
-    bf_mul(Q, Q, &G, prec, BF_RNDN);
-    bf_delete(&P);
-    bf_delete(&G);
-}
-
-static int bf_const_get(bf_t *T, limb_t prec, bf_flags_t flags,
-                        BFConstCache *c,
-                        void (*func)(bf_t *res, limb_t prec), int sign)
-{
-    limb_t ziv_extra_bits, prec1;
-
-    ziv_extra_bits = 32;
-    for(;;) {
-        prec1 = prec + ziv_extra_bits;
-        if (c->prec < prec1) {
-            if (c->val.len == 0)
-                bf_init(T->ctx, &c->val);
-            func(&c->val, prec1);
-            c->prec = prec1;
-        } else {
-            prec1 = c->prec;
-        }
-        bf_set(T, &c->val);
-        T->sign = sign;
-        if (!bf_can_round(T, prec, flags & BF_RND_MASK, prec1)) {
-            /* and more precision and retry */
-            ziv_extra_bits = ziv_extra_bits  + (ziv_extra_bits / 2);
-        } else {
-            break;
-        }
-    }
-    return bf_round(T, prec, flags);
-}
-
-static void bf_const_free(BFConstCache *c)
-{
-    bf_delete(&c->val);
-    memset(c, 0, sizeof(*c));
-}
-
-int bf_const_log2(bf_t *T, limb_t prec, bf_flags_t flags)
-{
-    bf_context_t *s = T->ctx;
-    return bf_const_get(T, prec, flags, &s->log2_cache, bf_const_log2_internal, 0);
-}
-
-/* return rounded pi * (1 - 2 * sign) */
-static int bf_const_pi_signed(bf_t *T, int sign, limb_t prec, bf_flags_t flags)
-{
-    bf_context_t *s = T->ctx;
-    return bf_const_get(T, prec, flags, &s->pi_cache, bf_const_pi_internal,
-                        sign);
-}
-
-int bf_const_pi(bf_t *T, limb_t prec, bf_flags_t flags)
-{
-    return bf_const_pi_signed(T, 0, prec, flags);
-}
-
-void bf_clear_cache(bf_context_t *s)
-{
-#ifdef USE_FFT_MUL
-    fft_clear_cache(s);
-#endif
-    bf_const_free(&s->log2_cache);
-    bf_const_free(&s->pi_cache);
-}
-
-/* ZivFunc should compute the result 'r' with faithful rounding at
-   precision 'prec'. For efficiency purposes, the final bf_round()
-   does not need to be done in the function. */
-typedef int ZivFunc(bf_t *r, const bf_t *a, limb_t prec, void *opaque);
-
-static int bf_ziv_rounding(bf_t *r, const bf_t *a,
-                           limb_t prec, bf_flags_t flags,
-                           ZivFunc *f, void *opaque)
-{
-    int rnd_mode, ret;
-    slimb_t prec1, ziv_extra_bits;
-
-    rnd_mode = flags & BF_RND_MASK;
-    if (rnd_mode == BF_RNDF) {
-        /* no need to iterate */
-        f(r, a, prec, opaque);
-        ret = 0;
-    } else {
-        ziv_extra_bits = 32;
-        for(;;) {
-            prec1 = prec + ziv_extra_bits;
-            ret = f(r, a, prec1, opaque);
-            if (ret & (BF_ST_OVERFLOW | BF_ST_UNDERFLOW | BF_ST_MEM_ERROR)) {
-                /* overflow or underflow should never happen because
-                   it indicates the rounding cannot be done correctly,
-                   but we do not catch all the cases */
-                return ret;
-            }
-            /* if the result is exact, we can stop */
-            if (!(ret & BF_ST_INEXACT)) {
-                ret = 0;
-                break;
-            }
-            if (bf_can_round(r, prec, rnd_mode, prec1)) {
-                ret = BF_ST_INEXACT;
-                break;
-            }
-            ziv_extra_bits = ziv_extra_bits * 2;
-            //            printf("ziv_extra_bits=%" PRId64 "\n", (int64_t)ziv_extra_bits);
-        }
-    }
-    if (r->len == 0)
-        return ret;
-    else
-        return __bf_round(r, prec, flags, r->len, ret);
-}
-
-/* add (1 - 2*e_sign) * 2^e */
-static int bf_add_epsilon(bf_t *r, const bf_t *a, slimb_t e, int e_sign,
-                          limb_t prec, int flags)
-{
-    bf_t T_s, *T = &T_s;
-    int ret;
-    /* small argument case: result = 1 + epsilon * sign(x) */
-    bf_init(a->ctx, T);
-    bf_set_ui(T, 1);
-    T->sign = e_sign;
-    T->expn += e;
-    ret = bf_add(r, r, T, prec, flags);
-    bf_delete(T);
-    return ret;
-}
-
-/* Compute the exponential using faithful rounding at precision 'prec'.
-   Note: the algorithm is from MPFR */
-static int bf_exp_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
-{
-    bf_context_t *s = r->ctx;
-    bf_t T_s, *T = &T_s;
-    slimb_t n, K, l, i, prec1;
-
-    assert(r != a);
-
-    /* argument reduction:
-       T = a - n*log(2) with 0 <= T < log(2) and n integer.
-    */
-    bf_init(s, T);
-    if (a->expn <= -1) {
-        /* 0 <= abs(a) <= 0.5 */
-        if (a->sign)
-            n = -1;
-        else
-            n = 0;
-    } else {
-        bf_const_log2(T, LIMB_BITS, BF_RNDZ);
-        bf_div(T, a, T, LIMB_BITS, BF_RNDD);
-        bf_get_limb(&n, T, 0);
-    }
-
-    K = bf_isqrt((prec + 1) / 2);
-    l = (prec - 1) / K + 1;
-    /* XXX: precision analysis ? */
-    prec1 = prec + (K + 2 * l + 18) + K + 8;
-    if (a->expn > 0)
-        prec1 += a->expn;
-    //    printf("n=%ld K=%ld prec1=%ld\n", n, K, prec1);
-
-    bf_const_log2(T, prec1, BF_RNDF);
-    bf_mul_si(T, T, n, prec1, BF_RNDN);
-    bf_sub(T, a, T, prec1, BF_RNDN);
-
-    /* reduce the range of T */
-    bf_mul_2exp(T, -K, BF_PREC_INF, BF_RNDZ);
-
-    /* Taylor expansion around zero :
-     1 + x + x^2/2 + ... + x^n/n!
-     = (1 + x * (1 + x/2 * (1 + ... (x/n))))
-    */
-    {
-        bf_t U_s, *U = &U_s;
-
-        bf_init(s, U);
-        bf_set_ui(r, 1);
-        for(i = l ; i >= 1; i--) {
-            bf_set_ui(U, i);
-            bf_div(U, T, U, prec1, BF_RNDN);
-            bf_mul(r, r, U, prec1, BF_RNDN);
-            bf_add_si(r, r, 1, prec1, BF_RNDN);
-        }
-        bf_delete(U);
-    }
-    bf_delete(T);
-
-    /* undo the range reduction */
-    for(i = 0; i < K; i++) {
-        bf_mul(r, r, r, prec1, BF_RNDN | BF_FLAG_EXT_EXP);
-    }
-
-    /* undo the argument reduction */
-    bf_mul_2exp(r, n, BF_PREC_INF, BF_RNDZ | BF_FLAG_EXT_EXP);
-
-    return BF_ST_INEXACT;
-}
-
-/* crude overflow and underflow tests for exp(a). a_low <= a <= a_high */
-static int check_exp_underflow_overflow(bf_context_t *s, bf_t *r,
-                                        const bf_t *a_low, const bf_t *a_high,
-                                        limb_t prec, bf_flags_t flags)
-{
-    bf_t T_s, *T = &T_s;
-    bf_t log2_s, *log2 = &log2_s;
-    slimb_t e_min, e_max;
-
-    if (a_high->expn <= 0)
-        return 0;
-
-    e_max = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
-    e_min = -e_max + 3;
-    if (flags & BF_FLAG_SUBNORMAL)
-        e_min -= (prec - 1);
-
-    bf_init(s, T);
-    bf_init(s, log2);
-    bf_const_log2(log2, LIMB_BITS, BF_RNDU);
-    bf_mul_ui(T, log2, e_max, LIMB_BITS, BF_RNDU);
-    /* a_low > e_max * log(2) implies exp(a) > e_max */
-    if (bf_cmp_lt(T, a_low) > 0) {
-        /* overflow */
-        bf_delete(T);
-        bf_delete(log2);
-        return bf_set_overflow(r, 0, prec, flags);
-    }
-    /* a_high < (e_min - 2) * log(2) implies exp(a) < (e_min - 2) */
-    bf_const_log2(log2, LIMB_BITS, BF_RNDD);
-    bf_mul_si(T, log2, e_min - 2, LIMB_BITS, BF_RNDD);
-    if (bf_cmp_lt(a_high, T)) {
-        int rnd_mode = flags & BF_RND_MASK;
-
-        /* underflow */
-        bf_delete(T);
-        bf_delete(log2);
-        if (rnd_mode == BF_RNDU) {
-            /* set the smallest value */
-            bf_set_ui(r, 1);
-            r->expn = e_min;
-        } else {
-            bf_set_zero(r, 0);
-        }
-        return BF_ST_UNDERFLOW | BF_ST_INEXACT;
-    }
-    bf_delete(log2);
-    bf_delete(T);
-    return 0;
-}
-
-int bf_exp(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
-{
-    bf_context_t *s = r->ctx;
-    int ret;
-    assert(r != a);
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-        } else if (a->expn == BF_EXP_INF) {
-            if (a->sign)
-                bf_set_zero(r, 0);
-            else
-                bf_set_inf(r, 0);
-        } else {
-            bf_set_ui(r, 1);
-        }
-        return 0;
-    }
-
-    ret = check_exp_underflow_overflow(s, r, a, a, prec, flags);
-    if (ret)
-        return ret;
-    if (a->expn < 0 && (-a->expn) >= (prec + 2)) {
-        /* small argument case: result = 1 + epsilon * sign(x) */
-        bf_set_ui(r, 1);
-        return bf_add_epsilon(r, r, -(prec + 2), a->sign, prec, flags);
-    }
-
-    return bf_ziv_rounding(r, a, prec, flags, bf_exp_internal, NULL);
-}
-
-static int bf_log_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
-{
-    bf_context_t *s = r->ctx;
-    bf_t T_s, *T = &T_s;
-    bf_t U_s, *U = &U_s;
-    bf_t V_s, *V = &V_s;
-    slimb_t n, prec1, l, i, K;
-
-    assert(r != a);
-
-    bf_init(s, T);
-    /* argument reduction 1 */
-    /* T=a*2^n with 2/3 <= T <= 4/3 */
-    {
-        bf_t U_s, *U = &U_s;
-        bf_set(T, a);
-        n = T->expn;
-        T->expn = 0;
-        /* U= ~ 2/3 */
-        bf_init(s, U);
-        bf_set_ui(U, 0xaaaaaaaa);
-        U->expn = 0;
-        if (bf_cmp_lt(T, U)) {
-            T->expn++;
-            n--;
-        }
-        bf_delete(U);
-    }
-    //    printf("n=%ld\n", n);
-    //    bf_print_str("T", T);
-
-    /* XXX: precision analysis */
-    /* number of iterations for argument reduction 2 */
-    K = bf_isqrt((prec + 1) / 2);
-    /* order of Taylor expansion */
-    l = prec / (2 * K) + 1;
-    /* precision of the intermediate computations */
-    prec1 = prec + K + 2 * l + 32;
-
-    bf_init(s, U);
-    bf_init(s, V);
-
-    /* Note: cancellation occurs here, so we use more precision (XXX:
-       reduce the precision by computing the exact cancellation) */
-    bf_add_si(T, T, -1, BF_PREC_INF, BF_RNDN);
-
-    /* argument reduction 2 */
-    for(i = 0; i < K; i++) {
-        /* T = T / (1 + sqrt(1 + T)) */
-        bf_add_si(U, T, 1, prec1, BF_RNDN);
-        bf_sqrt(V, U, prec1, BF_RNDF);
-        bf_add_si(U, V, 1, prec1, BF_RNDN);
-        bf_div(T, T, U, prec1, BF_RNDN);
-    }
-
-    {
-        bf_t Y_s, *Y = &Y_s;
-        bf_t Y2_s, *Y2 = &Y2_s;
-        bf_init(s, Y);
-        bf_init(s, Y2);
-
-        /* compute ln(1+x) = ln((1+y)/(1-y)) with y=x/(2+x)
-           = y + y^3/3 + ... + y^(2*l + 1) / (2*l+1)
-           with Y=Y^2
-           = y*(1+Y/3+Y^2/5+...) = y*(1+Y*(1/3+Y*(1/5 + ...)))
-        */
-        bf_add_si(Y, T, 2, prec1, BF_RNDN);
-        bf_div(Y, T, Y, prec1, BF_RNDN);
-
-        bf_mul(Y2, Y, Y, prec1, BF_RNDN);
-        bf_set_ui(r, 0);
-        for(i = l; i >= 1; i--) {
-            bf_set_ui(U, 1);
-            bf_set_ui(V, 2 * i + 1);
-            bf_div(U, U, V, prec1, BF_RNDN);
-            bf_add(r, r, U, prec1, BF_RNDN);
-            bf_mul(r, r, Y2, prec1, BF_RNDN);
-        }
-        bf_add_si(r, r, 1, prec1, BF_RNDN);
-        bf_mul(r, r, Y, prec1, BF_RNDN);
-        bf_delete(Y);
-        bf_delete(Y2);
-    }
-    bf_delete(V);
-    bf_delete(U);
-
-    /* multiplication by 2 for the Taylor expansion and undo the
-       argument reduction 2*/
-    bf_mul_2exp(r, K + 1, BF_PREC_INF, BF_RNDZ);
-
-    /* undo the argument reduction 1 */
-    bf_const_log2(T, prec1, BF_RNDF);
-    bf_mul_si(T, T, n, prec1, BF_RNDN);
-    bf_add(r, r, T, prec1, BF_RNDN);
-
-    bf_delete(T);
-    return BF_ST_INEXACT;
-}
-
-int bf_log(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
-{
-    bf_context_t *s = r->ctx;
-    bf_t T_s, *T = &T_s;
-
-    assert(r != a);
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF) {
-            if (a->sign) {
-                bf_set_nan(r);
-                return BF_ST_INVALID_OP;
-            } else {
-                bf_set_inf(r, 0);
-                return 0;
-            }
-        } else {
-            bf_set_inf(r, 1);
-            return 0;
-        }
-    }
-    if (a->sign) {
-        bf_set_nan(r);
-        return BF_ST_INVALID_OP;
-    }
-    bf_init(s, T);
-    bf_set_ui(T, 1);
-    if (bf_cmp_eq(a, T)) {
-        bf_set_zero(r, 0);
-        bf_delete(T);
-        return 0;
-    }
-    bf_delete(T);
-
-    return bf_ziv_rounding(r, a, prec, flags, bf_log_internal, NULL);
-}
-
-/* x and y finite and x > 0 */
-static int bf_pow_generic(bf_t *r, const bf_t *x, limb_t prec, void *opaque)
-{
-    bf_context_t *s = r->ctx;
-    const bf_t *y = opaque;
-    bf_t T_s, *T = &T_s;
-    limb_t prec1;
-
-    bf_init(s, T);
-    /* XXX: proof for the added precision */
-    prec1 = prec + 32;
-    bf_log(T, x, prec1, BF_RNDF | BF_FLAG_EXT_EXP);
-    bf_mul(T, T, y, prec1, BF_RNDF | BF_FLAG_EXT_EXP);
-    if (bf_is_nan(T))
-        bf_set_nan(r);
-    else
-        bf_exp_internal(r, T, prec1, NULL); /* no overflow/underlow test needed */
-    bf_delete(T);
-    return BF_ST_INEXACT;
-}
-
-/* x and y finite, x > 0, y integer and y fits on one limb */
-static int bf_pow_int(bf_t *r, const bf_t *x, limb_t prec, void *opaque)
-{
-    bf_context_t *s = r->ctx;
-    const bf_t *y = opaque;
-    bf_t T_s, *T = &T_s;
-    limb_t prec1;
-    int ret;
-    slimb_t y1;
-
-    bf_get_limb(&y1, y, 0);
-    if (y1 < 0)
-        y1 = -y1;
-    /* XXX: proof for the added precision */
-    prec1 = prec + ceil_log2(y1) * 2 + 8;
-    ret = bf_pow_ui(r, x, y1 < 0 ? -y1 : y1, prec1, BF_RNDN | BF_FLAG_EXT_EXP);
-    if (y->sign) {
-        bf_init(s, T);
-        bf_set_ui(T, 1);
-        ret |= bf_div(r, T, r, prec1, BF_RNDN | BF_FLAG_EXT_EXP);
-        bf_delete(T);
-    }
-    return ret;
-}
-
-/* x must be a finite non zero float. Return true if there is a
-   floating point number r such as x=r^(2^n) and return this floating
-   point number 'r'. Otherwise return false and r is undefined. */
-static bool check_exact_power2n(bf_t *r, const bf_t *x, slimb_t n)
-{
-    bf_context_t *s = r->ctx;
-    bf_t T_s, *T = &T_s;
-    slimb_t e, i, er;
-    limb_t v;
-
-    /* x = m*2^e with m odd integer */
-    e = bf_get_exp_min(x);
-    /* fast check on the exponent */
-    if (n > (LIMB_BITS - 1)) {
-        if (e != 0)
-            return false;
-        er = 0;
-    } else {
-        if ((e & (((limb_t)1 << n) - 1)) != 0)
-            return false;
-        er = e >> n;
-    }
-    /* every perfect odd square = 1 modulo 8 */
-    v = get_bits(x->tab, x->len, x->len * LIMB_BITS - x->expn + e);
-    if ((v & 7) != 1)
-        return false;
-
-    bf_init(s, T);
-    bf_set(T, x);
-    T->expn -= e;
-    for(i = 0; i < n; i++) {
-        if (i != 0)
-            bf_set(T, r);
-        if (bf_sqrtrem(r, NULL, T) != 0)
-            return false;
-    }
-    r->expn += er;
-    return true;
-}
-
-/* prec = BF_PREC_INF is accepted for x and y integers and y >= 0 */
-int bf_pow(bf_t *r, const bf_t *x, const bf_t *y, limb_t prec, bf_flags_t flags)
-{
-    bf_context_t *s = r->ctx;
-    bf_t T_s, *T = &T_s;
-    bf_t ytmp_s;
-    bool y_is_int, y_is_odd;
-    int r_sign, ret, rnd_mode;
-    slimb_t y_emin;
-
-    if (x->len == 0 || y->len == 0) {
-        if (y->expn == BF_EXP_ZERO) {
-            /* pow(x, 0) = 1 */
-            bf_set_ui(r, 1);
-        } else if (x->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-        } else {
-            int cmp_x_abs_1;
-            bf_set_ui(r, 1);
-            cmp_x_abs_1 = bf_cmpu(x, r);
-            if (cmp_x_abs_1 == 0 && (flags & BF_POW_JS_QUIRKS) &&
-                (y->expn >= BF_EXP_INF)) {
-                bf_set_nan(r);
-            } else if (cmp_x_abs_1 == 0 &&
-                       (!x->sign || y->expn != BF_EXP_NAN)) {
-                /* pow(1, y) = 1 even if y = NaN */
-                /* pow(-1, +/-inf) = 1 */
-            } else if (y->expn == BF_EXP_NAN) {
-                bf_set_nan(r);
-            } else if (y->expn == BF_EXP_INF) {
-                if (y->sign == (cmp_x_abs_1 > 0)) {
-                    bf_set_zero(r, 0);
-                } else {
-                    bf_set_inf(r, 0);
-                }
-            } else {
-                y_emin = bf_get_exp_min(y);
-                y_is_odd = (y_emin == 0);
-                if (y->sign == (x->expn == BF_EXP_ZERO)) {
-                    bf_set_inf(r, y_is_odd & x->sign);
-                    if (y->sign) {
-                        /* pow(0, y) with y < 0 */
-                        return BF_ST_DIVIDE_ZERO;
-                    }
-                } else {
-                    bf_set_zero(r, y_is_odd & x->sign);
-                }
-            }
-        }
-        return 0;
-    }
-    bf_init(s, T);
-    bf_set(T, x);
-    y_emin = bf_get_exp_min(y);
-    y_is_int = (y_emin >= 0);
-    rnd_mode = flags & BF_RND_MASK;
-    if (x->sign) {
-        if (!y_is_int) {
-            bf_set_nan(r);
-            bf_delete(T);
-            return BF_ST_INVALID_OP;
-        }
-        y_is_odd = (y_emin == 0);
-        r_sign = y_is_odd;
-        /* change the directed rounding mode if the sign of the result
-           is changed */
-        if (r_sign && (rnd_mode == BF_RNDD || rnd_mode == BF_RNDU))
-            flags ^= 1;
-        bf_neg(T);
-    } else {
-        r_sign = 0;
-    }
-
-    bf_set_ui(r, 1);
-    if (bf_cmp_eq(T, r)) {
-        /* abs(x) = 1: nothing more to do */
-        ret = 0;
-    } else {
-        /* check the overflow/underflow cases */
-        {
-            bf_t al_s, *al = &al_s;
-            bf_t ah_s, *ah = &ah_s;
-            limb_t precl = LIMB_BITS;
-
-            bf_init(s, al);
-            bf_init(s, ah);
-            /* compute bounds of log(abs(x)) * y with a low precision */
-            /* XXX: compute bf_log() once */
-            /* XXX: add a fast test before this slow test */
-            bf_log(al, T, precl, BF_RNDD);
-            bf_log(ah, T, precl, BF_RNDU);
-            bf_mul(al, al, y, precl, BF_RNDD ^ y->sign);
-            bf_mul(ah, ah, y, precl, BF_RNDU ^ y->sign);
-            ret = check_exp_underflow_overflow(s, r, al, ah, prec, flags);
-            bf_delete(al);
-            bf_delete(ah);
-            if (ret)
-                goto done;
-        }
-
-        if (y_is_int) {
-            slimb_t T_bits, e;
-        int_pow:
-            T_bits = T->expn - bf_get_exp_min(T);
-            if (T_bits == 1) {
-                /* pow(2^b, y) = 2^(b*y) */
-                bf_mul_si(T, y, T->expn - 1, LIMB_BITS, BF_RNDZ);
-                bf_get_limb(&e, T, 0);
-                bf_set_ui(r, 1);
-                ret = bf_mul_2exp(r, e, prec, flags);
-            } else if (prec == BF_PREC_INF) {
-                slimb_t y1;
-                /* specific case for infinite precision (integer case) */
-                bf_get_limb(&y1, y, 0);
-                assert(!y->sign);
-                /* x must be an integer, so abs(x) >= 2 */
-                if (y1 >= ((slimb_t)1 << BF_EXP_BITS_MAX)) {
-                    bf_delete(T);
-                    return bf_set_overflow(r, 0, BF_PREC_INF, flags);
-                }
-                ret = bf_pow_ui(r, T, y1, BF_PREC_INF, BF_RNDZ);
-            } else {
-                if (y->expn <= 31) {
-                    /* small enough power: use exponentiation in all cases */
-                } else if (y->sign) {
-                    /* cannot be exact */
-                    goto general_case;
-                } else {
-                    if (rnd_mode == BF_RNDF)
-                        goto general_case; /* no need to track exact results */
-                    /* see if the result has a chance to be exact:
-                       if x=a*2^b (a odd), x^y=a^y*2^(b*y)
-                       x^y needs a precision of at least floor_log2(a)*y bits
-                    */
-                    bf_mul_si(r, y, T_bits - 1, LIMB_BITS, BF_RNDZ);
-                    bf_get_limb(&e, r, 0);
-                    if (prec < e)
-                        goto general_case;
-                }
-                ret = bf_ziv_rounding(r, T, prec, flags, bf_pow_int, (void *)y);
-            }
-        } else {
-            if (rnd_mode != BF_RNDF) {
-                bf_t *y1;
-                if (y_emin < 0 && check_exact_power2n(r, T, -y_emin)) {
-                    /* the problem is reduced to a power to an integer */
-                    bf_set(T, r);
-                    y1 = &ytmp_s;
-                    y1->tab = y->tab;
-                    y1->len = y->len;
-                    y1->sign = y->sign;
-                    y1->expn = y->expn - y_emin;
-                    y = y1;
-                    goto int_pow;
-                }
-            }
-        general_case:
-            ret = bf_ziv_rounding(r, T, prec, flags, bf_pow_generic, (void *)y);
-        }
-    }
- done:
-    bf_delete(T);
-    r->sign = r_sign;
-    return ret;
-}
-
-/* compute sqrt(-2*x-x^2) to get |sin(x)| from cos(x) - 1. */
-static void bf_sqrt_sin(bf_t *r, const bf_t *x, limb_t prec1)
-{
-    bf_context_t *s = r->ctx;
-    bf_t T_s, *T = &T_s;
-    bf_init(s, T);
-    bf_set(T, x);
-    bf_mul(r, T, T, prec1, BF_RNDN);
-    bf_mul_2exp(T, 1, BF_PREC_INF, BF_RNDZ);
-    bf_add(T, T, r, prec1, BF_RNDN);
-    bf_neg(T);
-    bf_sqrt(r, T, prec1, BF_RNDF);
-    bf_delete(T);
-}
-
-static int bf_sincos(bf_t *s, bf_t *c, const bf_t *a, limb_t prec)
-{
-    bf_context_t *s1 = a->ctx;
-    bf_t T_s, *T = &T_s;
-    bf_t U_s, *U = &U_s;
-    bf_t r_s, *r = &r_s;
-    slimb_t K, prec1, i, l, mod, prec2;
-    int is_neg;
-
-    assert(c != a && s != a);
-
-    bf_init(s1, T);
-    bf_init(s1, U);
-    bf_init(s1, r);
-
-    /* XXX: precision analysis */
-    K = bf_isqrt(prec / 2);
-    l = prec / (2 * K) + 1;
-    prec1 = prec + 2 * K + l + 8;
-
-    /* after the modulo reduction, -pi/4 <= T <= pi/4 */
-    if (a->expn <= -1) {
-        /* abs(a) <= 0.25: no modulo reduction needed */
-        bf_set(T, a);
-        mod = 0;
-    } else {
-        slimb_t cancel;
-        cancel = 0;
-        for(;;) {
-            prec2 = prec1 + a->expn + cancel;
-            bf_const_pi(U, prec2, BF_RNDF);
-            bf_mul_2exp(U, -1, BF_PREC_INF, BF_RNDZ);
-            bf_remquo(&mod, T, a, U, prec2, BF_RNDN, BF_RNDN);
-            //            printf("T.expn=%ld prec2=%ld\n", T->expn, prec2);
-            if (mod == 0 || (T->expn != BF_EXP_ZERO &&
-                             (T->expn + prec2) >= (prec1 - 1)))
-                break;
-            /* increase the number of bits until the precision is good enough */
-            cancel = bf_max(-T->expn, (cancel + 1) * 3 / 2);
-        }
-        mod &= 3;
-    }
-
-    is_neg = T->sign;
-
-    /* compute cosm1(x) = cos(x) - 1 */
-    bf_mul(T, T, T, prec1, BF_RNDN);
-    bf_mul_2exp(T, -2 * K, BF_PREC_INF, BF_RNDZ);
-
-    /* Taylor expansion:
-       -x^2/2 + x^4/4! - x^6/6! + ...
-    */
-    bf_set_ui(r, 1);
-    for(i = l ; i >= 1; i--) {
-        bf_set_ui(U, 2 * i - 1);
-        bf_mul_ui(U, U, 2 * i, BF_PREC_INF, BF_RNDZ);
-        bf_div(U, T, U, prec1, BF_RNDN);
-        bf_mul(r, r, U, prec1, BF_RNDN);
-        bf_neg(r);
-        if (i != 1)
-            bf_add_si(r, r, 1, prec1, BF_RNDN);
-    }
-    bf_delete(U);
-
-    /* undo argument reduction:
-       cosm1(2*x)= 2*(2*cosm1(x)+cosm1(x)^2)
-    */
-    for(i = 0; i < K; i++) {
-        bf_mul(T, r, r, prec1, BF_RNDN);
-        bf_mul_2exp(r, 1, BF_PREC_INF, BF_RNDZ);
-        bf_add(r, r, T, prec1, BF_RNDN);
-        bf_mul_2exp(r, 1, BF_PREC_INF, BF_RNDZ);
-    }
-    bf_delete(T);
-
-    if (c) {
-        if ((mod & 1) == 0) {
-            bf_add_si(c, r, 1, prec1, BF_RNDN);
-        } else {
-            bf_sqrt_sin(c, r, prec1);
-            c->sign = is_neg ^ 1;
-        }
-        c->sign ^= mod >> 1;
-    }
-    if (s) {
-        if ((mod & 1) == 0) {
-            bf_sqrt_sin(s, r, prec1);
-            s->sign = is_neg;
-        } else {
-            bf_add_si(s, r, 1, prec1, BF_RNDN);
-        }
-        s->sign ^= mod >> 1;
-    }
-    bf_delete(r);
-    return BF_ST_INEXACT;
-}
-
-static int bf_cos_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
-{
-    return bf_sincos(NULL, r, a, prec);
-}
-
-int bf_cos(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
-{
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF) {
-            bf_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else {
-            bf_set_ui(r, 1);
-            return 0;
-        }
-    }
-
-    /* small argument case: result = 1+r(x) with r(x) = -x^2/2 +
-       O(X^4). We assume r(x) < 2^(2*EXP(x) - 1). */
-    if (a->expn < 0) {
-        slimb_t e;
-        e = 2 * a->expn - 1;
-        if (e < -(prec + 2)) {
-            bf_set_ui(r, 1);
-            return bf_add_epsilon(r, r, e, 1, prec, flags);
-        }
-    }
-
-    return bf_ziv_rounding(r, a, prec, flags, bf_cos_internal, NULL);
-}
-
-static int bf_sin_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
-{
-    return bf_sincos(r, NULL, a, prec);
-}
-
-int bf_sin(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
-{
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF) {
-            bf_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else {
-            bf_set_zero(r, a->sign);
-            return 0;
-        }
-    }
-
-    /* small argument case: result = x+r(x) with r(x) = -x^3/6 +
-       O(X^5). We assume r(x) < 2^(3*EXP(x) - 2). */
-    if (a->expn < 0) {
-        slimb_t e;
-        e = sat_add(2 * a->expn, a->expn - 2);
-        if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) {
-            bf_set(r, a);
-            return bf_add_epsilon(r, r, e, 1 - a->sign, prec, flags);
-        }
-    }
-
-    return bf_ziv_rounding(r, a, prec, flags, bf_sin_internal, NULL);
-}
-
-static int bf_tan_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
-{
-    bf_context_t *s = r->ctx;
-    bf_t T_s, *T = &T_s;
-    limb_t prec1;
-
-    /* XXX: precision analysis */
-    prec1 = prec + 8;
-    bf_init(s, T);
-    bf_sincos(r, T, a, prec1);
-    bf_div(r, r, T, prec1, BF_RNDF);
-    bf_delete(T);
-    return BF_ST_INEXACT;
-}
-
-int bf_tan(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
-{
-    assert(r != a);
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF) {
-            bf_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else {
-            bf_set_zero(r, a->sign);
-            return 0;
-        }
-    }
-
-    /* small argument case: result = x+r(x) with r(x) = x^3/3 +
-       O(X^5). We assume r(x) < 2^(3*EXP(x) - 1). */
-    if (a->expn < 0) {
-        slimb_t e;
-        e = sat_add(2 * a->expn, a->expn - 1);
-        if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) {
-            bf_set(r, a);
-            return bf_add_epsilon(r, r, e, a->sign, prec, flags);
-        }
-    }
-
-    return bf_ziv_rounding(r, a, prec, flags, bf_tan_internal, NULL);
-}
-
-/* if add_pi2 is true, add pi/2 to the result (used for acos(x) to
-   avoid cancellation) */
-static int bf_atan_internal(bf_t *r, const bf_t *a, limb_t prec,
-                            void *opaque)
-{
-    bf_context_t *s = r->ctx;
-    bool add_pi2 = (bool)(intptr_t)opaque;
-    bf_t T_s, *T = &T_s;
-    bf_t U_s, *U = &U_s;
-    bf_t V_s, *V = &V_s;
-    bf_t X2_s, *X2 = &X2_s;
-    int cmp_1;
-    slimb_t prec1, i, K, l;
-
-    /* XXX: precision analysis */
-    K = bf_isqrt((prec + 1) / 2);
-    l = prec / (2 * K) + 1;
-    prec1 = prec + K + 2 * l + 32;
-    //    printf("prec=%d K=%d l=%d prec1=%d\n", (int)prec, (int)K, (int)l, (int)prec1);
-
-    bf_init(s, T);
-    cmp_1 = (a->expn >= 1); /* a >= 1 */
-    if (cmp_1) {
-        bf_set_ui(T, 1);
-        bf_div(T, T, a, prec1, BF_RNDN);
-    } else {
-        bf_set(T, a);
-    }
-
-    /* abs(T) <= 1 */
-
-    /* argument reduction */
-
-    bf_init(s, U);
-    bf_init(s, V);
-    bf_init(s, X2);
-    for(i = 0; i < K; i++) {
-        /* T = T / (1 + sqrt(1 + T^2)) */
-        bf_mul(U, T, T, prec1, BF_RNDN);
-        bf_add_si(U, U, 1, prec1, BF_RNDN);
-        bf_sqrt(V, U, prec1, BF_RNDN);
-        bf_add_si(V, V, 1, prec1, BF_RNDN);
-        bf_div(T, T, V, prec1, BF_RNDN);
-    }
-
-    /* Taylor series:
-       x - x^3/3 + ... + (-1)^ l * y^(2*l + 1) / (2*l+1)
-    */
-    bf_mul(X2, T, T, prec1, BF_RNDN);
-    bf_set_ui(r, 0);
-    for(i = l; i >= 1; i--) {
-        bf_set_si(U, 1);
-        bf_set_ui(V, 2 * i + 1);
-        bf_div(U, U, V, prec1, BF_RNDN);
-        bf_neg(r);
-        bf_add(r, r, U, prec1, BF_RNDN);
-        bf_mul(r, r, X2, prec1, BF_RNDN);
-    }
-    bf_neg(r);
-    bf_add_si(r, r, 1, prec1, BF_RNDN);
-    bf_mul(r, r, T, prec1, BF_RNDN);
-
-    /* undo the argument reduction */
-    bf_mul_2exp(r, K, BF_PREC_INF, BF_RNDZ);
-
-    bf_delete(U);
-    bf_delete(V);
-    bf_delete(X2);
-
-    i = add_pi2;
-    if (cmp_1 > 0) {
-        /* undo the inversion : r = sign(a)*PI/2 - r */
-        bf_neg(r);
-        i += 1 - 2 * a->sign;
-    }
-    /* add i*(pi/2) with -1 <= i <= 2 */
-    if (i != 0) {
-        bf_const_pi(T, prec1, BF_RNDF);
-        if (i != 2)
-            bf_mul_2exp(T, -1, BF_PREC_INF, BF_RNDZ);
-        T->sign = (i < 0);
-        bf_add(r, T, r, prec1, BF_RNDN);
-    }
-
-    bf_delete(T);
-    return BF_ST_INEXACT;
-}
-
-int bf_atan(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
-{
-    bf_context_t *s = r->ctx;
-    bf_t T_s, *T = &T_s;
-    int res;
-
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF)  {
-            /* -PI/2 or PI/2 */
-            bf_const_pi_signed(r, a->sign, prec, flags);
-            bf_mul_2exp(r, -1, BF_PREC_INF, BF_RNDZ);
-            return BF_ST_INEXACT;
-        } else {
-            bf_set_zero(r, a->sign);
-            return 0;
-        }
-    }
-
-    bf_init(s, T);
-    bf_set_ui(T, 1);
-    res = bf_cmpu(a, T);
-    bf_delete(T);
-    if (res == 0) {
-        /* short cut: abs(a) == 1 -> +/-pi/4 */
-        bf_const_pi_signed(r, a->sign, prec, flags);
-        bf_mul_2exp(r, -2, BF_PREC_INF, BF_RNDZ);
-        return BF_ST_INEXACT;
-    }
-
-    /* small argument case: result = x+r(x) with r(x) = -x^3/3 +
-       O(X^5). We assume r(x) < 2^(3*EXP(x) - 1). */
-    if (a->expn < 0) {
-        slimb_t e;
-        e = sat_add(2 * a->expn, a->expn - 1);
-        if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) {
-            bf_set(r, a);
-            return bf_add_epsilon(r, r, e, 1 - a->sign, prec, flags);
-        }
-    }
-
-    return bf_ziv_rounding(r, a, prec, flags, bf_atan_internal, (void *)false);
-}
-
-static int bf_atan2_internal(bf_t *r, const bf_t *y, limb_t prec, void *opaque)
-{
-    bf_context_t *s = r->ctx;
-    const bf_t *x = opaque;
-    bf_t T_s, *T = &T_s;
-    limb_t prec1;
-    int ret;
-
-    if (y->expn == BF_EXP_NAN || x->expn == BF_EXP_NAN) {
-        bf_set_nan(r);
-        return 0;
-    }
-
-    /* compute atan(y/x) assumming inf/inf = 1 and 0/0 = 0 */
-    bf_init(s, T);
-    prec1 = prec + 32;
-    if (y->expn == BF_EXP_INF && x->expn == BF_EXP_INF) {
-        bf_set_ui(T, 1);
-        T->sign = y->sign ^ x->sign;
-    } else if (y->expn == BF_EXP_ZERO && x->expn == BF_EXP_ZERO) {
-        bf_set_zero(T, y->sign ^ x->sign);
-    } else {
-        bf_div(T, y, x, prec1, BF_RNDF);
-    }
-    ret = bf_atan(r, T, prec1, BF_RNDF);
-
-    if (x->sign) {
-        /* if x < 0 (it includes -0), return sign(y)*pi + atan(y/x) */
-        bf_const_pi(T, prec1, BF_RNDF);
-        T->sign = y->sign;
-        bf_add(r, r, T, prec1, BF_RNDN);
-        ret |= BF_ST_INEXACT;
-    }
-
-    bf_delete(T);
-    return ret;
-}
-
-int bf_atan2(bf_t *r, const bf_t *y, const bf_t *x,
-             limb_t prec, bf_flags_t flags)
-{
-    return bf_ziv_rounding(r, y, prec, flags, bf_atan2_internal, (void *)x);
-}
-
-static int bf_asin_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
-{
-    bf_context_t *s = r->ctx;
-    bool is_acos = (bool)(intptr_t)opaque;
-    bf_t T_s, *T = &T_s;
-    limb_t prec1, prec2;
-
-    /* asin(x) = atan(x/sqrt(1-x^2))
-       acos(x) = pi/2 - asin(x) */
-    prec1 = prec + 8;
-    /* increase the precision in x^2 to compensate the cancellation in
-       (1-x^2) if x is close to 1 */
-    /* XXX: use less precision when possible */
-    if (a->expn >= 0)
-        prec2 = BF_PREC_INF;
-    else
-        prec2 = prec1;
-    bf_init(s, T);
-    bf_mul(T, a, a, prec2, BF_RNDN);
-    bf_neg(T);
-    bf_add_si(T, T, 1, prec2, BF_RNDN);
-
-    bf_sqrt(r, T, prec1, BF_RNDN);
-    bf_div(T, a, r, prec1, BF_RNDN);
-    if (is_acos)
-        bf_neg(T);
-    bf_atan_internal(r, T, prec1, (void *)(intptr_t)is_acos);
-    bf_delete(T);
-    return BF_ST_INEXACT;
-}
-
-int bf_asin(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
-{
-    bf_context_t *s = r->ctx;
-    bf_t T_s, *T = &T_s;
-    int res;
-
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF) {
-            bf_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else {
-            bf_set_zero(r, a->sign);
-            return 0;
-        }
-    }
-    bf_init(s, T);
-    bf_set_ui(T, 1);
-    res = bf_cmpu(a, T);
-    bf_delete(T);
-    if (res > 0) {
-        bf_set_nan(r);
-        return BF_ST_INVALID_OP;
-    }
-
-    /* small argument case: result = x+r(x) with r(x) = x^3/6 +
-       O(X^5). We assume r(x) < 2^(3*EXP(x) - 2). */
-    if (a->expn < 0) {
-        slimb_t e;
-        e = sat_add(2 * a->expn, a->expn - 2);
-        if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) {
-            bf_set(r, a);
-            return bf_add_epsilon(r, r, e, a->sign, prec, flags);
-        }
-    }
-
-    return bf_ziv_rounding(r, a, prec, flags, bf_asin_internal, (void *)false);
-}
-
-int bf_acos(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
-{
-    bf_context_t *s = r->ctx;
-    bf_t T_s, *T = &T_s;
-    int res;
-
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bf_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF) {
-            bf_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else {
-            bf_const_pi(r, prec, flags);
-            bf_mul_2exp(r, -1, BF_PREC_INF, BF_RNDZ);
-            return BF_ST_INEXACT;
-        }
-    }
-    bf_init(s, T);
-    bf_set_ui(T, 1);
-    res = bf_cmpu(a, T);
-    bf_delete(T);
-    if (res > 0) {
-        bf_set_nan(r);
-        return BF_ST_INVALID_OP;
-    } else if (res == 0 && a->sign == 0) {
-        bf_set_zero(r, 0);
-        return 0;
-    }
-
-    return bf_ziv_rounding(r, a, prec, flags, bf_asin_internal, (void *)true);
-}
-
-/***************************************************************/
-/* decimal floating point numbers */
-
-#ifdef USE_BF_DEC
-
-#define adddq(r1, r0, a1, a0)                   \
-    do {                                        \
-        limb_t __t = r0;                        \
-        r0 += (a0);                             \
-        r1 += (a1) + (r0 < __t);                \
-    } while (0)
-
-#define subdq(r1, r0, a1, a0)                   \
-    do {                                        \
-        limb_t __t = r0;                        \
-        r0 -= (a0);                             \
-        r1 -= (a1) + (r0 > __t);                \
-    } while (0)
-
-#if LIMB_BITS == 64
-
-/* Note: we assume __int128 is available */
-/* uint128_t defined in libbf.h          */
-#define muldq(r1, r0, a, b)                     \
-    do {                                        \
-        uint128_t __t;                          \
-        __t = (uint128_t)(a) * (uint128_t)(b);  \
-        r0 = __t;                               \
-        r1 = __t >> 64;                         \
-    } while (0)
-
-#define divdq(q, r, a1, a0, b)                  \
-    do {                                        \
-        uint128_t __t;                  \
-        limb_t __b = (b);                       \
-        __t = ((uint128_t)(a1) << 64) | (a0);   \
-        q = __t / __b;                                  \
-        r = __t % __b;                                  \
-    } while (0)
-
-#else
-
-#define muldq(r1, r0, a, b)                     \
-    do {                                        \
-        uint64_t __t;                          \
-        __t = (uint64_t)(a) * (uint64_t)(b);  \
-        r0 = __t;                               \
-        r1 = __t >> 32;                         \
-    } while (0)
-
-#define divdq(q, r, a1, a0, b)                  \
-    do {                                        \
-        uint64_t __t;                  \
-        limb_t __b = (b);                       \
-        __t = ((uint64_t)(a1) << 32) | (a0);   \
-        q = __t / __b;                                  \
-        r = __t % __b;                                  \
-    } while (0)
-
-#endif /* LIMB_BITS != 64 */
-
-#if LIMB_DIGITS == 19
-
-/* WARNING: hardcoded for b = 1e19. It is assumed that:
-   0 <= a1 < 2^63 */
-#define divdq_base(q, r, a1, a0)\
-do {\
-    uint64_t __a0, __a1, __t0, __t1, __b = BF_DEC_BASE; \
-    __a0 = a0;\
-    __a1 = a1;\
-    __t0 = __a1;\
-    __t0 = shld(__t0, __a0, 1);\
-    muldq(q, __t1, __t0, UINT64_C(17014118346046923173)); \
-    muldq(__t1, __t0, q, __b);\
-    subdq(__a1, __a0, __t1, __t0);\
-    subdq(__a1, __a0, 1, __b * 2);    \
-    __t0 = (slimb_t)__a1 >> 1; \
-    q += 2 + __t0;\
-    adddq(__a1, __a0, 0, __b & __t0);\
-    q += __a1;                  \
-    __a0 += __b & __a1;           \
-    r = __a0;\
-} while(0)
-
-#elif LIMB_DIGITS == 9
-
-/* WARNING: hardcoded for b = 1e9. It is assumed that:
-   0 <= a1 < 2^29 */
-#define divdq_base(q, r, a1, a0)\
-do {\
-    uint32_t __t0, __t1, __b = BF_DEC_BASE; \
-    __t0 = a1;\
-    __t1 = a0;\
-    __t0 = (__t0 << 3) | (__t1 >> (32 - 3));    \
-    muldq(q, __t1, __t0, 2305843009U);\
-    r = a0 - q * __b;\
-    __t1 = (r >= __b);\
-    q += __t1;\
-    if (__t1)\
-        r -= __b;\
-} while(0)
-
-#endif
-
-/* fast integer division by a fixed constant */
-
-typedef struct FastDivData {
-    limb_t m1; /* multiplier */
-    int8_t shift1;
-    int8_t shift2;
-} FastDivData;
-
-/* From "Division by Invariant Integers using Multiplication" by
-   Torborn Granlund and Peter L. Montgomery */
-/* d must be != 0 */
-static inline __maybe_unused void fast_udiv_init(FastDivData *s, limb_t d)
-{
-    int l;
-    limb_t q, r, m1;
-    if (d == 1)
-        l = 0;
-    else
-        l = 64 - clz64(d - 1);
-    divdq(q, r, ((limb_t)1 << l) - d, 0, d);
-    (void)r;
-    m1 = q + 1;
-    //    printf("d=%lu l=%d m1=0x%016lx\n", d, l, m1);
-    s->m1 = m1;
-    s->shift1 = l;
-    if (s->shift1 > 1)
-        s->shift1 = 1;
-    s->shift2 = l - 1;
-    if (s->shift2 < 0)
-        s->shift2 = 0;
-}
-
-static inline limb_t fast_udiv(limb_t a, const FastDivData *s)
-{
-    limb_t t0, t1;
-    muldq(t1, t0, s->m1, a);
-    t0 = (a - t1) >> s->shift1;
-    return (t1 + t0) >> s->shift2;
-}
-
-/* contains 10^i */
-const limb_t mp_pow_dec[LIMB_DIGITS + 1] = {
-    1U,
-    10U,
-    100U,
-    1000U,
-    10000U,
-    100000U,
-    1000000U,
-    10000000U,
-    100000000U,
-    1000000000U,
-#if LIMB_BITS == 64
-    10000000000U,
-    100000000000U,
-    1000000000000U,
-    10000000000000U,
-    100000000000000U,
-    1000000000000000U,
-    10000000000000000U,
-    100000000000000000U,
-    1000000000000000000U,
-    10000000000000000000U,
-#endif
-};
-
-/* precomputed from fast_udiv_init(10^i) */
-static const FastDivData mp_pow_div[LIMB_DIGITS + 1] = {
-#if LIMB_BITS == 32
-    { 0x00000001, 0, 0 },
-    { 0x9999999a, 1, 3 },
-    { 0x47ae147b, 1, 6 },
-    { 0x0624dd30, 1, 9 },
-    { 0xa36e2eb2, 1, 13 },
-    { 0x4f8b588f, 1, 16 },
-    { 0x0c6f7a0c, 1, 19 },
-    { 0xad7f29ac, 1, 23 },
-    { 0x5798ee24, 1, 26 },
-    { 0x12e0be83, 1, 29 },
-#else
-    { 0x0000000000000001, 0, 0 },
-    { 0x999999999999999a, 1, 3 },
-    { 0x47ae147ae147ae15, 1, 6 },
-    { 0x0624dd2f1a9fbe77, 1, 9 },
-    { 0xa36e2eb1c432ca58, 1, 13 },
-    { 0x4f8b588e368f0847, 1, 16 },
-    { 0x0c6f7a0b5ed8d36c, 1, 19 },
-    { 0xad7f29abcaf48579, 1, 23 },
-    { 0x5798ee2308c39dfa, 1, 26 },
-    { 0x12e0be826d694b2f, 1, 29 },
-    { 0xb7cdfd9d7bdbab7e, 1, 33 },
-    { 0x5fd7fe17964955fe, 1, 36 },
-    { 0x19799812dea11198, 1, 39 },
-    { 0xc25c268497681c27, 1, 43 },
-    { 0x6849b86a12b9b01f, 1, 46 },
-    { 0x203af9ee756159b3, 1, 49 },
-    { 0xcd2b297d889bc2b7, 1, 53 },
-    { 0x70ef54646d496893, 1, 56 },
-    { 0x2725dd1d243aba0f, 1, 59 },
-    { 0xd83c94fb6d2ac34d, 1, 63 },
-#endif
-};
-
-/* divide by 10^shift with 0 <= shift <= LIMB_DIGITS */
-static inline limb_t fast_shr_dec(limb_t a, int shift)
-{
-    return fast_udiv(a, &mp_pow_div[shift]);
-}
-
-/* division and remainder by 10^shift */
-#define fast_shr_rem_dec(q, r, a, shift) q = fast_shr_dec(a, shift), r = a - q * mp_pow_dec[shift]
-
-limb_t mp_add_dec(limb_t *res, const limb_t *op1, const limb_t *op2,
-                  mp_size_t n, limb_t carry)
-{
-    limb_t base = BF_DEC_BASE;
-    mp_size_t i;
-    limb_t k, a, v;
-
-    k=carry;
-    for(i=0;i<n;i++) {
-        /* XXX: reuse the trick in add_mod */
-        v = op1[i];
-        a = v + op2[i] + k - base;
-        k = a <= v;
-        if (!k)
-            a += base;
-        res[i]=a;
-    }
-    return k;
-}
-
-limb_t mp_add_ui_dec(limb_t *tab, limb_t b, mp_size_t n)
-{
-    limb_t base = BF_DEC_BASE;
-    mp_size_t i;
-    limb_t k, a, v;
-
-    k=b;
-    for(i=0;i<n;i++) {
-        v = tab[i];
-        a = v + k - base;
-        k = a <= v;
-        if (!k)
-            a += base;
-        tab[i] = a;
-        if (k == 0)
-            break;
-    }
-    return k;
-}
-
-limb_t mp_sub_dec(limb_t *res, const limb_t *op1, const limb_t *op2,
-                  mp_size_t n, limb_t carry)
-{
-    limb_t base = BF_DEC_BASE;
-    mp_size_t i;
-    limb_t k, v, a;
-
-    k=carry;
-    for(i=0;i<n;i++) {
-        v = op1[i];
-        a = v - op2[i] - k;
-        k = a > v;
-        if (k)
-            a += base;
-        res[i] = a;
-    }
-    return k;
-}
-
-limb_t mp_sub_ui_dec(limb_t *tab, limb_t b, mp_size_t n)
-{
-    limb_t base = BF_DEC_BASE;
-    mp_size_t i;
-    limb_t k, v, a;
-
-    k=b;
-    for(i=0;i<n;i++) {
-        v = tab[i];
-        a = v - k;
-        k = a > v;
-        if (k)
-            a += base;
-        tab[i]=a;
-        if (k == 0)
-            break;
-    }
-    return k;
-}
-
-/* taba[] = taba[] * b + l. 0 <= b, l <= base - 1. Return the high carry */
-limb_t mp_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n,
-                   limb_t b, limb_t l)
-{
-    mp_size_t i;
-    limb_t t0, t1, r;
-
-    for(i = 0; i < n; i++) {
-        muldq(t1, t0, taba[i], b);
-        adddq(t1, t0, 0, l);
-        divdq_base(l, r, t1, t0);
-        tabr[i] = r;
-    }
-    return l;
-}
-
-/* tabr[] += taba[] * b. 0 <= b <= base - 1. Return the value to add
-   to the high word */
-limb_t mp_add_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n,
-                       limb_t b)
-{
-    mp_size_t i;
-    limb_t l, t0, t1, r;
-
-    l = 0;
-    for(i = 0; i < n; i++) {
-        muldq(t1, t0, taba[i], b);
-        adddq(t1, t0, 0, l);
-        adddq(t1, t0, 0, tabr[i]);
-        divdq_base(l, r, t1, t0);
-        tabr[i] = r;
-    }
-    return l;
-}
-
-/* tabr[] -= taba[] * b. 0 <= b <= base - 1. Return the value to
-   substract to the high word. */
-limb_t mp_sub_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n,
-                       limb_t b)
-{
-    limb_t base = BF_DEC_BASE;
-    mp_size_t i;
-    limb_t l, t0, t1, r, a, v, c;
-
-    /* XXX: optimize */
-    l = 0;
-    for(i = 0; i < n; i++) {
-        muldq(t1, t0, taba[i], b);
-        adddq(t1, t0, 0, l);
-        divdq_base(l, r, t1, t0);
-        v = tabr[i];
-        a = v - r;
-        c = a > v;
-        if (c)
-            a += base;
-        /* never bigger than base because r = 0 when l = base - 1 */
-        l += c;
-        tabr[i] = a;
-    }
-    return l;
-}
-
-/* size of the result : op1_size + op2_size. */
-void mp_mul_basecase_dec(limb_t *result,
-                         const limb_t *op1, mp_size_t op1_size,
-                         const limb_t *op2, mp_size_t op2_size)
-{
-    mp_size_t i;
-    limb_t r;
-
-    result[op1_size] = mp_mul1_dec(result, op1, op1_size, op2[0], 0);
-
-    for(i=1;i<op2_size;i++) {
-        r = mp_add_mul1_dec(result + i, op1, op1_size, op2[i]);
-        result[i + op1_size] = r;
-    }
-}
-
-/* taba[] = (taba[] + r*base^na) / b. 0 <= b < base. 0 <= r <
-   b. Return the remainder. */
-limb_t mp_div1_dec(limb_t *tabr, const limb_t *taba, mp_size_t na,
-                   limb_t b, limb_t r)
-{
-    limb_t base = BF_DEC_BASE;
-    mp_size_t i;
-    limb_t t0, t1, q;
-    int shift;
-
-#if (BF_DEC_BASE % 2) == 0
-    if (b == 2) {
-        limb_t base_div2;
-        /* Note: only works if base is even */
-        base_div2 = base >> 1;
-        if (r)
-            r = base_div2;
-        for(i = na - 1; i >= 0; i--) {
-            t0 = taba[i];
-            tabr[i] = (t0 >> 1) + r;
-            r = 0;
-            if (t0 & 1)
-                r = base_div2;
-        }
-        if (r)
-            r = 1;
-    } else
-#endif
-    if (na >= UDIV1NORM_THRESHOLD) {
-        shift = clz(b);
-        if (shift == 0) {
-            /* normalized case: b >= 2^(LIMB_BITS-1) */
-            limb_t b_inv;
-            b_inv = udiv1norm_init(b);
-            for(i = na - 1; i >= 0; i--) {
-                muldq(t1, t0, r, base);
-                adddq(t1, t0, 0, taba[i]);
-                q = udiv1norm(&r, t1, t0, b, b_inv);
-                tabr[i] = q;
-            }
-        } else {
-            limb_t b_inv;
-            b <<= shift;
-            b_inv = udiv1norm_init(b);
-            for(i = na - 1; i >= 0; i--) {
-                muldq(t1, t0, r, base);
-                adddq(t1, t0, 0, taba[i]);
-                t1 = (t1 << shift) | (t0 >> (LIMB_BITS - shift));
-                t0 <<= shift;
-                q = udiv1norm(&r, t1, t0, b, b_inv);
-                r >>= shift;
-                tabr[i] = q;
-            }
-        }
-    } else {
-        for(i = na - 1; i >= 0; i--) {
-            muldq(t1, t0, r, base);
-            adddq(t1, t0, 0, taba[i]);
-            divdq(q, r, t1, t0, b);
-            tabr[i] = q;
-        }
-    }
-    return r;
-}
-
-static __maybe_unused void mp_print_str_dec(const char *str,
-                                       const limb_t *tab, slimb_t n)
-{
-    slimb_t i;
-    printf("%s=", str);
-    for(i = n - 1; i >= 0; i--) {
-        if (i != n - 1)
-            printf("_");
-        printf("%0*" PRIu_LIMB, LIMB_DIGITS, tab[i]);
-    }
-    printf("\n");
-}
-
-static __maybe_unused void mp_print_str_h_dec(const char *str,
-                                              const limb_t *tab, slimb_t n,
-                                              limb_t high)
-{
-    slimb_t i;
-    printf("%s=", str);
-    printf("%0*" PRIu_LIMB, LIMB_DIGITS, high);
-    for(i = n - 1; i >= 0; i--) {
-        printf("_");
-        printf("%0*" PRIu_LIMB, LIMB_DIGITS, tab[i]);
-    }
-    printf("\n");
-}
-
-//#define DEBUG_DIV_SLOW
-
-#define DIV_STATIC_ALLOC_LEN 16
-
-/* return q = a / b and r = a % b.
-
-   taba[na] must be allocated if tabb1[nb - 1] < B / 2.  tabb1[nb - 1]
-   must be != zero. na must be >= nb. 's' can be NULL if tabb1[nb - 1]
-   >= B / 2.
-
-   The remainder is is returned in taba and contains nb libms. tabq
-   contains na - nb + 1 limbs. No overlap is permitted.
-
-   Running time of the standard method: (na - nb + 1) * nb
-   Return 0 if OK, -1 if memory alloc error
-*/
-/* XXX: optimize */
-static int mp_div_dec(bf_context_t *s, limb_t *tabq,
-                      limb_t *taba, mp_size_t na,
-                      const limb_t *tabb1, mp_size_t nb)
-{
-    limb_t base = BF_DEC_BASE;
-    limb_t r, mult, t0, t1, a, c, q, v, *tabb;
-    mp_size_t i, j;
-    limb_t static_tabb[DIV_STATIC_ALLOC_LEN];
-
-#ifdef DEBUG_DIV_SLOW
-    mp_print_str_dec("a", taba, na);
-    mp_print_str_dec("b", tabb1, nb);
-#endif
-
-    /* normalize tabb */
-    r = tabb1[nb - 1];
-    assert(r != 0);
-    i = na - nb;
-    if (r >= BF_DEC_BASE / 2) {
-        mult = 1;
-        tabb = (limb_t *)tabb1;
-        q = 1;
-        for(j = nb - 1; j >= 0; j--) {
-            if (taba[i + j] != tabb[j]) {
-                if (taba[i + j] < tabb[j])
-                    q = 0;
-                break;
-            }
-        }
-        tabq[i] = q;
-        if (q) {
-            mp_sub_dec(taba + i, taba + i, tabb, nb, 0);
-        }
-        i--;
-    } else {
-        mult = base / (r + 1);
-        if (likely(nb <= DIV_STATIC_ALLOC_LEN)) {
-            tabb = static_tabb;
-        } else {
-            tabb = bf_malloc(s, sizeof(limb_t) * nb);
-            if (!tabb)
-                return -1;
-        }
-        mp_mul1_dec(tabb, tabb1, nb, mult, 0);
-        taba[na] = mp_mul1_dec(taba, taba, na, mult, 0);
-    }
-
-#ifdef DEBUG_DIV_SLOW
-    printf("mult=" FMT_LIMB "\n", mult);
-    mp_print_str_dec("a_norm", taba, na + 1);
-    mp_print_str_dec("b_norm", tabb, nb);
-#endif
-
-    for(; i >= 0; i--) {
-        if (unlikely(taba[i + nb] >= tabb[nb - 1])) {
-            /* XXX: check if it is really possible */
-            q = base - 1;
-        } else {
-            muldq(t1, t0, taba[i + nb], base);
-            adddq(t1, t0, 0, taba[i + nb - 1]);
-            divdq(q, r, t1, t0, tabb[nb - 1]);
-        }
-        //        printf("i=%d q1=%ld\n", i, q);
-
-        r = mp_sub_mul1_dec(taba + i, tabb, nb, q);
-        //        mp_dump("r1", taba + i, nb, bd);
-        //        printf("r2=%ld\n", r);
-
-        v = taba[i + nb];
-        a = v - r;
-        c = a > v;
-        if (c)
-            a += base;
-        taba[i + nb] = a;
-
-        if (c != 0) {
-            /* negative result */
-            for(;;) {
-                q--;
-                c = mp_add_dec(taba + i, taba + i, tabb, nb, 0);
-                /* propagate carry and test if positive result */
-                if (c != 0) {
-                    if (++taba[i + nb] == base) {
-                        break;
-                    }
-                }
-            }
-        }
-        tabq[i] = q;
-    }
-
-#ifdef DEBUG_DIV_SLOW
-    mp_print_str_dec("q", tabq, na - nb + 1);
-    mp_print_str_dec("r", taba, nb);
-#endif
-
-    /* remove the normalization */
-    if (mult != 1) {
-        mp_div1_dec(taba, taba, nb, mult, 0);
-        if (unlikely(tabb != static_tabb))
-            bf_free(s, tabb);
-    }
-    return 0;
-}
-
-/* divide by 10^shift */
-static limb_t mp_shr_dec(limb_t *tab_r, const limb_t *tab, mp_size_t n,
-                         limb_t shift, limb_t high)
-{
-    mp_size_t i;
-    limb_t l, a, q, r;
-
-    assert(shift >= 1 && shift < LIMB_DIGITS);
-    l = high;
-    for(i = n - 1; i >= 0; i--) {
-        a = tab[i];
-        fast_shr_rem_dec(q, r, a, shift);
-        tab_r[i] = q + l * mp_pow_dec[LIMB_DIGITS - shift];
-        l = r;
-    }
-    return l;
-}
-
-/* multiply by 10^shift */
-static limb_t mp_shl_dec(limb_t *tab_r, const limb_t *tab, mp_size_t n,
-                         limb_t shift, limb_t low)
-{
-    mp_size_t i;
-    limb_t l, a, q, r;
-
-    assert(shift >= 1 && shift < LIMB_DIGITS);
-    l = low;
-    for(i = 0; i < n; i++) {
-        a = tab[i];
-        fast_shr_rem_dec(q, r, a, LIMB_DIGITS - shift);
-        tab_r[i] = r * mp_pow_dec[shift] + l;
-        l = q;
-    }
-    return l;
-}
-
-static limb_t mp_sqrtrem2_dec(limb_t *tabs, limb_t *taba)
-{
-    int k;
-    dlimb_t a, b, r;
-    limb_t taba1[2], s, r0, r1;
-
-    /* convert to binary and normalize */
-    a = (dlimb_t)taba[1] * BF_DEC_BASE + taba[0];
-    k = clz(a >> LIMB_BITS) & ~1;
-    b = a << k;
-    taba1[0] = b;
-    taba1[1] = b >> LIMB_BITS;
-    mp_sqrtrem2(&s, taba1);
-    s >>= (k >> 1);
-    /* convert the remainder back to decimal */
-    r = a - (dlimb_t)s * (dlimb_t)s;
-    divdq_base(r1, r0, r >> LIMB_BITS, r);
-    taba[0] = r0;
-    tabs[0] = s;
-    return r1;
-}
-
-//#define DEBUG_SQRTREM_DEC
-
-/* tmp_buf must contain (n / 2 + 1 limbs) */
-static limb_t mp_sqrtrem_rec_dec(limb_t *tabs, limb_t *taba, limb_t n,
-                                 limb_t *tmp_buf)
-{
-    limb_t l, h, rh, ql, qh, c, i;
-
-    if (n == 1)
-        return mp_sqrtrem2_dec(tabs, taba);
-#ifdef DEBUG_SQRTREM_DEC
-    mp_print_str_dec("a", taba, 2 * n);
-#endif
-    l = n / 2;
-    h = n - l;
-    qh = mp_sqrtrem_rec_dec(tabs + l, taba + 2 * l, h, tmp_buf);
-#ifdef DEBUG_SQRTREM_DEC
-    mp_print_str_dec("s1", tabs + l, h);
-    mp_print_str_h_dec("r1", taba + 2 * l, h, qh);
-    mp_print_str_h_dec("r2", taba + l, n, qh);
-#endif
-
-    /* the remainder is in taba + 2 * l. Its high bit is in qh */
-    if (qh) {
-        mp_sub_dec(taba + 2 * l, taba + 2 * l, tabs + l, h, 0);
-    }
-    /* instead of dividing by 2*s, divide by s (which is normalized)
-       and update q and r */
-    mp_div_dec(NULL, tmp_buf, taba + l, n, tabs + l, h);
-    qh += tmp_buf[l];
-    for(i = 0; i < l; i++)
-        tabs[i] = tmp_buf[i];
-    ql = mp_div1_dec(tabs, tabs, l, 2, qh & 1);
-    qh = qh >> 1; /* 0 or 1 */
-    if (ql)
-        rh = mp_add_dec(taba + l, taba + l, tabs + l, h, 0);
-    else
-        rh = 0;
-#ifdef DEBUG_SQRTREM_DEC
-    mp_print_str_h_dec("q", tabs, l, qh);
-    mp_print_str_h_dec("u", taba + l, h, rh);
-#endif
-
-    mp_add_ui_dec(tabs + l, qh, h);
-#ifdef DEBUG_SQRTREM_DEC
-    mp_print_str_dec("s2", tabs, n);
-#endif
-
-    /* q = qh, tabs[l - 1 ... 0], r = taba[n - 1 ... l] */
-    /* subtract q^2. if qh = 1 then q = B^l, so we can take shortcuts */
-    if (qh) {
-        c = qh;
-    } else {
-        mp_mul_basecase_dec(taba + n, tabs, l, tabs, l);
-        c = mp_sub_dec(taba, taba, taba + n, 2 * l, 0);
-    }
-    rh -= mp_sub_ui_dec(taba + 2 * l, c, n - 2 * l);
-    if ((slimb_t)rh < 0) {
-        mp_sub_ui_dec(tabs, 1, n);
-        rh += mp_add_mul1_dec(taba, tabs, n, 2);
-        rh += mp_add_ui_dec(taba, 1, n);
-    }
-    return rh;
-}
-
-/* 'taba' has 2*n limbs with n >= 1 and taba[2*n-1] >= B/4. Return (s,
-   r) with s=floor(sqrt(a)) and r=a-s^2. 0 <= r <= 2 * s. tabs has n
-   limbs. r is returned in the lower n limbs of taba. Its r[n] is the
-   returned value of the function. */
-int mp_sqrtrem_dec(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n)
-{
-    limb_t tmp_buf1[8];
-    limb_t *tmp_buf;
-    mp_size_t n2;
-    n2 = n / 2 + 1;
-    if (n2 <= countof(tmp_buf1)) {
-        tmp_buf = tmp_buf1;
-    } else {
-        tmp_buf = bf_malloc(s, sizeof(limb_t) * n2);
-        if (!tmp_buf)
-            return -1;
-    }
-    taba[n] = mp_sqrtrem_rec_dec(tabs, taba, n, tmp_buf);
-    if (tmp_buf != tmp_buf1)
-        bf_free(s, tmp_buf);
-    return 0;
-}
-
-/* return the number of leading zero digits, from 0 to LIMB_DIGITS */
-static int clz_dec(limb_t a)
-{
-    if (a == 0)
-        return LIMB_DIGITS;
-    switch(LIMB_BITS - 1 - clz(a)) {
-    case 0: /* 1-1 */
-        return LIMB_DIGITS - 1;
-    case 1: /* 2-3 */
-        return LIMB_DIGITS - 1;
-    case 2: /* 4-7 */
-        return LIMB_DIGITS - 1;
-    case 3: /* 8-15 */
-        if (a < 10)
-            return LIMB_DIGITS - 1;
-        else
-            return LIMB_DIGITS - 2;
-    case 4: /* 16-31 */
-        return LIMB_DIGITS - 2;
-    case 5: /* 32-63 */
-        return LIMB_DIGITS - 2;
-    case 6: /* 64-127 */
-        if (a < 100)
-            return LIMB_DIGITS - 2;
-        else
-            return LIMB_DIGITS - 3;
-    case 7: /* 128-255 */
-        return LIMB_DIGITS - 3;
-    case 8: /* 256-511 */
-        return LIMB_DIGITS - 3;
-    case 9: /* 512-1023 */
-        if (a < 1000)
-            return LIMB_DIGITS - 3;
-        else
-            return LIMB_DIGITS - 4;
-    case 10: /* 1024-2047 */
-        return LIMB_DIGITS - 4;
-    case 11: /* 2048-4095 */
-        return LIMB_DIGITS - 4;
-    case 12: /* 4096-8191 */
-        return LIMB_DIGITS - 4;
-    case 13: /* 8192-16383 */
-        if (a < 10000)
-            return LIMB_DIGITS - 4;
-        else
-            return LIMB_DIGITS - 5;
-    case 14: /* 16384-32767 */
-        return LIMB_DIGITS - 5;
-    case 15: /* 32768-65535 */
-        return LIMB_DIGITS - 5;
-    case 16: /* 65536-131071 */
-        if (a < 100000)
-            return LIMB_DIGITS - 5;
-        else
-            return LIMB_DIGITS - 6;
-    case 17: /* 131072-262143 */
-        return LIMB_DIGITS - 6;
-    case 18: /* 262144-524287 */
-        return LIMB_DIGITS - 6;
-    case 19: /* 524288-1048575 */
-        if (a < 1000000)
-            return LIMB_DIGITS - 6;
-        else
-            return LIMB_DIGITS - 7;
-    case 20: /* 1048576-2097151 */
-        return LIMB_DIGITS - 7;
-    case 21: /* 2097152-4194303 */
-        return LIMB_DIGITS - 7;
-    case 22: /* 4194304-8388607 */
-        return LIMB_DIGITS - 7;
-    case 23: /* 8388608-16777215 */
-        if (a < 10000000)
-            return LIMB_DIGITS - 7;
-        else
-            return LIMB_DIGITS - 8;
-    case 24: /* 16777216-33554431 */
-        return LIMB_DIGITS - 8;
-    case 25: /* 33554432-67108863 */
-        return LIMB_DIGITS - 8;
-    case 26: /* 67108864-134217727 */
-        if (a < 100000000)
-            return LIMB_DIGITS - 8;
-        else
-            return LIMB_DIGITS - 9;
-#if LIMB_BITS == 64
-    case 27: /* 134217728-268435455 */
-        return LIMB_DIGITS - 9;
-    case 28: /* 268435456-536870911 */
-        return LIMB_DIGITS - 9;
-    case 29: /* 536870912-1073741823 */
-        if (a < 1000000000)
-            return LIMB_DIGITS - 9;
-        else
-            return LIMB_DIGITS - 10;
-    case 30: /* 1073741824-2147483647 */
-        return LIMB_DIGITS - 10;
-    case 31: /* 2147483648-4294967295 */
-        return LIMB_DIGITS - 10;
-    case 32: /* 4294967296-8589934591 */
-        return LIMB_DIGITS - 10;
-    case 33: /* 8589934592-17179869183 */
-        if (a < 10000000000)
-            return LIMB_DIGITS - 10;
-        else
-            return LIMB_DIGITS - 11;
-    case 34: /* 17179869184-34359738367 */
-        return LIMB_DIGITS - 11;
-    case 35: /* 34359738368-68719476735 */
-        return LIMB_DIGITS - 11;
-    case 36: /* 68719476736-137438953471 */
-        if (a < 100000000000)
-            return LIMB_DIGITS - 11;
-        else
-            return LIMB_DIGITS - 12;
-    case 37: /* 137438953472-274877906943 */
-        return LIMB_DIGITS - 12;
-    case 38: /* 274877906944-549755813887 */
-        return LIMB_DIGITS - 12;
-    case 39: /* 549755813888-1099511627775 */
-        if (a < 1000000000000)
-            return LIMB_DIGITS - 12;
-        else
-            return LIMB_DIGITS - 13;
-    case 40: /* 1099511627776-2199023255551 */
-        return LIMB_DIGITS - 13;
-    case 41: /* 2199023255552-4398046511103 */
-        return LIMB_DIGITS - 13;
-    case 42: /* 4398046511104-8796093022207 */
-        return LIMB_DIGITS - 13;
-    case 43: /* 8796093022208-17592186044415 */
-        if (a < 10000000000000)
-            return LIMB_DIGITS - 13;
-        else
-            return LIMB_DIGITS - 14;
-    case 44: /* 17592186044416-35184372088831 */
-        return LIMB_DIGITS - 14;
-    case 45: /* 35184372088832-70368744177663 */
-        return LIMB_DIGITS - 14;
-    case 46: /* 70368744177664-140737488355327 */
-        if (a < 100000000000000)
-            return LIMB_DIGITS - 14;
-        else
-            return LIMB_DIGITS - 15;
-    case 47: /* 140737488355328-281474976710655 */
-        return LIMB_DIGITS - 15;
-    case 48: /* 281474976710656-562949953421311 */
-        return LIMB_DIGITS - 15;
-    case 49: /* 562949953421312-1125899906842623 */
-        if (a < 1000000000000000)
-            return LIMB_DIGITS - 15;
-        else
-            return LIMB_DIGITS - 16;
-    case 50: /* 1125899906842624-2251799813685247 */
-        return LIMB_DIGITS - 16;
-    case 51: /* 2251799813685248-4503599627370495 */
-        return LIMB_DIGITS - 16;
-    case 52: /* 4503599627370496-9007199254740991 */
-        return LIMB_DIGITS - 16;
-    case 53: /* 9007199254740992-18014398509481983 */
-        if (a < 10000000000000000)
-            return LIMB_DIGITS - 16;
-        else
-            return LIMB_DIGITS - 17;
-    case 54: /* 18014398509481984-36028797018963967 */
-        return LIMB_DIGITS - 17;
-    case 55: /* 36028797018963968-72057594037927935 */
-        return LIMB_DIGITS - 17;
-    case 56: /* 72057594037927936-144115188075855871 */
-        if (a < 100000000000000000)
-            return LIMB_DIGITS - 17;
-        else
-            return LIMB_DIGITS - 18;
-    case 57: /* 144115188075855872-288230376151711743 */
-        return LIMB_DIGITS - 18;
-    case 58: /* 288230376151711744-576460752303423487 */
-        return LIMB_DIGITS - 18;
-    case 59: /* 576460752303423488-1152921504606846975 */
-        if (a < 1000000000000000000)
-            return LIMB_DIGITS - 18;
-        else
-            return LIMB_DIGITS - 19;
-#endif
-    default:
-        return 0;
-    }
-}
-
-/* for debugging */
-void bfdec_print_str(const char *str, const bfdec_t *a)
-{
-    slimb_t i;
-    printf("%s=", str);
-
-    if (a->expn == BF_EXP_NAN) {
-        printf("NaN");
-    } else {
-        if (a->sign)
-            putchar('-');
-        if (a->expn == BF_EXP_ZERO) {
-            putchar('0');
-        } else if (a->expn == BF_EXP_INF) {
-            printf("Inf");
-        } else {
-            printf("0.");
-            for(i = a->len - 1; i >= 0; i--)
-                printf("%0*" PRIu_LIMB, LIMB_DIGITS, a->tab[i]);
-            printf("e%" PRId_LIMB, a->expn);
-        }
-    }
-    printf("\n");
-}
-
-/* return != 0 if one digit between 0 and bit_pos inclusive is not zero. */
-static inline limb_t scan_digit_nz(const bfdec_t *r, slimb_t bit_pos)
-{
-    slimb_t pos;
-    limb_t v, q;
-    int shift;
-
-    if (bit_pos < 0)
-        return 0;
-    pos = (limb_t)bit_pos / LIMB_DIGITS;
-    shift = (limb_t)bit_pos % LIMB_DIGITS;
-    fast_shr_rem_dec(q, v, r->tab[pos], shift + 1);
-    (void)q;
-    if (v != 0)
-        return 1;
-    pos--;
-    while (pos >= 0) {
-        if (r->tab[pos] != 0)
-            return 1;
-        pos--;
-    }
-    return 0;
-}
-
-static limb_t get_digit(const limb_t *tab, limb_t len, slimb_t pos)
-{
-    slimb_t i;
-    int shift;
-    i = floor_div(pos, LIMB_DIGITS);
-    if (i < 0 || i >= len)
-        return 0;
-    shift = pos - i * LIMB_DIGITS;
-    return fast_shr_dec(tab[i], shift) % 10;
-}
-
-/* return the addend for rounding. Note that prec can be <= 0 for bf_rint() */
-static int bfdec_get_rnd_add(int *pret, const bfdec_t *r, limb_t l,
-                             slimb_t prec, int rnd_mode)
-{
-    int add_one, inexact;
-    limb_t digit1, digit0;
-
-    //    bfdec_print_str("get_rnd_add", r);
-    if (rnd_mode == BF_RNDF) {
-        digit0 = 1; /* faithful rounding does not honor the INEXACT flag */
-    } else {
-        /* starting limb for bit 'prec + 1' */
-        digit0 = scan_digit_nz(r, l * LIMB_DIGITS - 1 - bf_max(0, prec + 1));
-    }
-
-    /* get the digit at 'prec' */
-    digit1 = get_digit(r->tab, l, l * LIMB_DIGITS - 1 - prec);
-    inexact = (digit1 | digit0) != 0;
-
-    add_one = 0;
-    switch(rnd_mode) {
-    case BF_RNDZ:
-        break;
-    case BF_RNDN:
-        if (digit1 == 5) {
-            if (digit0) {
-                add_one = 1;
-            } else {
-                /* round to even */
-                add_one =
-                    get_digit(r->tab, l, l * LIMB_DIGITS - 1 - (prec - 1)) & 1;
-            }
-        } else if (digit1 > 5) {
-            add_one = 1;
-        }
-        break;
-    case BF_RNDD:
-    case BF_RNDU:
-        if (r->sign == (rnd_mode == BF_RNDD))
-            add_one = inexact;
-        break;
-    case BF_RNDNA:
-    case BF_RNDF:
-        add_one = (digit1 >= 5);
-        break;
-    case BF_RNDA:
-        add_one = inexact;
-        break;
-    default:
-        abort();
-    }
-
-    if (inexact)
-        *pret |= BF_ST_INEXACT;
-    return add_one;
-}
-
-/* round to prec1 bits assuming 'r' is non zero and finite. 'r' is
-   assumed to have length 'l' (1 <= l <= r->len). prec1 can be
-   BF_PREC_INF. BF_FLAG_SUBNORMAL is not supported. Cannot fail with
-   BF_ST_MEM_ERROR.
- */
-static int __bfdec_round(bfdec_t *r, limb_t prec1, bf_flags_t flags, limb_t l)
-{
-    int shift, add_one, rnd_mode, ret;
-    slimb_t i, bit_pos, pos, e_min, e_max, e_range, prec;
-
-    /* XXX: align to IEEE 754 2008 for decimal numbers ? */
-    e_range = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
-    e_min = -e_range + 3;
-    e_max = e_range;
-
-    if (flags & BF_FLAG_RADPNT_PREC) {
-        /* 'prec' is the precision after the decimal point */
-        if (prec1 != BF_PREC_INF)
-            prec = r->expn + prec1;
-        else
-            prec = prec1;
-    } else if (unlikely(r->expn < e_min) && (flags & BF_FLAG_SUBNORMAL)) {
-        /* restrict the precision in case of potentially subnormal
-           result */
-        assert(prec1 != BF_PREC_INF);
-        prec = prec1 - (e_min - r->expn);
-    } else {
-        prec = prec1;
-    }
-
-    /* round to prec bits */
-    rnd_mode = flags & BF_RND_MASK;
-    ret = 0;
-    add_one = bfdec_get_rnd_add(&ret, r, l, prec, rnd_mode);
-
-    if (prec <= 0) {
-        if (add_one) {
-            bfdec_resize(r, 1); /* cannot fail because r is non zero */
-            r->tab[0] = BF_DEC_BASE / 10;
-            r->expn += 1 - prec;
-            ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
-            return ret;
-        } else {
-            goto underflow;
-        }
-    } else if (add_one) {
-        limb_t carry;
-
-        /* add one starting at digit 'prec - 1' */
-        bit_pos = l * LIMB_DIGITS - 1 - (prec - 1);
-        pos = bit_pos / LIMB_DIGITS;
-        carry = mp_pow_dec[bit_pos % LIMB_DIGITS];
-        carry = mp_add_ui_dec(r->tab + pos, carry, l - pos);
-        if (carry) {
-            /* shift right by one digit */
-            mp_shr_dec(r->tab + pos, r->tab + pos, l - pos, 1, 1);
-            r->expn++;
-        }
-    }
-
-    /* check underflow */
-    if (unlikely(r->expn < e_min)) {
-        if (flags & BF_FLAG_SUBNORMAL) {
-            /* if inexact, also set the underflow flag */
-            if (ret & BF_ST_INEXACT)
-                ret |= BF_ST_UNDERFLOW;
-        } else {
-        underflow:
-            bfdec_set_zero(r, r->sign);
-            ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
-            return ret;
-        }
-    }
-
-    /* check overflow */
-    if (unlikely(r->expn > e_max)) {
-        bfdec_set_inf(r, r->sign);
-        ret |= BF_ST_OVERFLOW | BF_ST_INEXACT;
-        return ret;
-    }
-
-    /* keep the bits starting at 'prec - 1' */
-    bit_pos = l * LIMB_DIGITS - 1 - (prec - 1);
-    i = floor_div(bit_pos, LIMB_DIGITS);
-    if (i >= 0) {
-        shift = smod(bit_pos, LIMB_DIGITS);
-        if (shift != 0) {
-            r->tab[i] = fast_shr_dec(r->tab[i], shift) *
-                mp_pow_dec[shift];
-        }
-    } else {
-        i = 0;
-    }
-    /* remove trailing zeros */
-    while (r->tab[i] == 0)
-        i++;
-    if (i > 0) {
-        l -= i;
-        memmove(r->tab, r->tab + i, l * sizeof(limb_t));
-    }
-    bfdec_resize(r, l); /* cannot fail */
-    return ret;
-}
-
-/* Cannot fail with BF_ST_MEM_ERROR. */
-int bfdec_round(bfdec_t *r, limb_t prec, bf_flags_t flags)
-{
-    if (r->len == 0)
-        return 0;
-    return __bfdec_round(r, prec, flags, r->len);
-}
-
-/* 'r' must be a finite number. Cannot fail with BF_ST_MEM_ERROR.  */
-int bfdec_normalize_and_round(bfdec_t *r, limb_t prec1, bf_flags_t flags)
-{
-    limb_t l, v;
-    int shift, ret;
-
-    //    bfdec_print_str("bf_renorm", r);
-    l = r->len;
-    while (l > 0 && r->tab[l - 1] == 0)
-        l--;
-    if (l == 0) {
-        /* zero */
-        r->expn = BF_EXP_ZERO;
-        bfdec_resize(r, 0); /* cannot fail */
-        ret = 0;
-    } else {
-        r->expn -= (r->len - l) * LIMB_DIGITS;
-        /* shift to have the MSB set to '1' */
-        v = r->tab[l - 1];
-        shift = clz_dec(v);
-        if (shift != 0) {
-            mp_shl_dec(r->tab, r->tab, l, shift, 0);
-            r->expn -= shift;
-        }
-        ret = __bfdec_round(r, prec1, flags, l);
-    }
-    //    bf_print_str("r_final", r);
-    return ret;
-}
-
-int bfdec_set_ui(bfdec_t *r, uint64_t v)
-{
-#if LIMB_BITS == 32
-    if (v >= BF_DEC_BASE * BF_DEC_BASE) {
-        if (bfdec_resize(r, 3))
-            goto fail;
-        r->tab[0] = v % BF_DEC_BASE;
-        v /= BF_DEC_BASE;
-        r->tab[1] = v % BF_DEC_BASE;
-        r->tab[2] = v / BF_DEC_BASE;
-        r->expn = 3 * LIMB_DIGITS;
-    } else
-#endif
-    if (v >= BF_DEC_BASE) {
-        if (bfdec_resize(r, 2))
-            goto fail;
-        r->tab[0] = v % BF_DEC_BASE;
-        r->tab[1] = v / BF_DEC_BASE;
-        r->expn = 2 * LIMB_DIGITS;
-    } else {
-        if (bfdec_resize(r, 1))
-            goto fail;
-        r->tab[0] = v;
-        r->expn = LIMB_DIGITS;
-    }
-    r->sign = 0;
-    return bfdec_normalize_and_round(r, BF_PREC_INF, 0);
- fail:
-    bfdec_set_nan(r);
-    return BF_ST_MEM_ERROR;
-}
-
-int bfdec_set_si(bfdec_t *r, int64_t v)
-{
-    int ret;
-    if (v < 0) {
-        ret = bfdec_set_ui(r, -v);
-        r->sign = 1;
-    } else {
-        ret = bfdec_set_ui(r, v);
-    }
-    return ret;
-}
-
-static int bfdec_add_internal(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, bf_flags_t flags, int b_neg)
-{
-    bf_context_t *s = r->ctx;
-    int is_sub, cmp_res, a_sign, b_sign, ret;
-
-    a_sign = a->sign;
-    b_sign = b->sign ^ b_neg;
-    is_sub = a_sign ^ b_sign;
-    cmp_res = bfdec_cmpu(a, b);
-    if (cmp_res < 0) {
-        const bfdec_t *tmp;
-        tmp = a;
-        a = b;
-        b = tmp;
-        a_sign = b_sign; /* b_sign is never used later */
-    }
-    /* abs(a) >= abs(b) */
-    if (cmp_res == 0 && is_sub && a->expn < BF_EXP_INF) {
-        /* zero result */
-        bfdec_set_zero(r, (flags & BF_RND_MASK) == BF_RNDD);
-        ret = 0;
-    } else if (a->len == 0 || b->len == 0) {
-        ret = 0;
-        if (a->expn >= BF_EXP_INF) {
-            if (a->expn == BF_EXP_NAN) {
-                /* at least one operand is NaN */
-                bfdec_set_nan(r);
-                ret = 0;
-            } else if (b->expn == BF_EXP_INF && is_sub) {
-                /* infinities with different signs */
-                bfdec_set_nan(r);
-                ret = BF_ST_INVALID_OP;
-            } else {
-                bfdec_set_inf(r, a_sign);
-            }
-        } else {
-            /* at least one zero and not subtract */
-            if (bfdec_set(r, a))
-                return BF_ST_MEM_ERROR;
-            r->sign = a_sign;
-            goto renorm;
-        }
-    } else {
-        slimb_t d, a_offset, b_offset, i, r_len;
-        limb_t carry;
-        limb_t *b1_tab;
-        int b_shift;
-        mp_size_t b1_len;
-
-        d = a->expn - b->expn;
-
-        /* XXX: not efficient in time and memory if the precision is
-           not infinite */
-        r_len = bf_max(a->len, b->len + (d + LIMB_DIGITS - 1) / LIMB_DIGITS);
-        if (bfdec_resize(r, r_len))
-            goto fail;
-        r->sign = a_sign;
-        r->expn = a->expn;
-
-        a_offset = r_len - a->len;
-        for(i = 0; i < a_offset; i++)
-            r->tab[i] = 0;
-        for(i = 0; i < a->len; i++)
-            r->tab[a_offset + i] = a->tab[i];
-
-        b_shift = d % LIMB_DIGITS;
-        if (b_shift == 0) {
-            b1_len = b->len;
-            b1_tab = (limb_t *)b->tab;
-        } else {
-            b1_len = b->len + 1;
-            b1_tab = bf_malloc(s, sizeof(limb_t) * b1_len);
-            if (!b1_tab)
-                goto fail;
-            b1_tab[0] = mp_shr_dec(b1_tab + 1, b->tab, b->len, b_shift, 0) *
-                mp_pow_dec[LIMB_DIGITS - b_shift];
-        }
-        b_offset = r_len - (b->len + (d + LIMB_DIGITS - 1) / LIMB_DIGITS);
-
-        if (is_sub) {
-            carry = mp_sub_dec(r->tab + b_offset, r->tab + b_offset,
-                               b1_tab, b1_len, 0);
-            if (carry != 0) {
-                carry = mp_sub_ui_dec(r->tab + b_offset + b1_len, carry,
-                                      r_len - (b_offset + b1_len));
-                assert(carry == 0);
-            }
-        } else {
-            carry = mp_add_dec(r->tab + b_offset, r->tab + b_offset,
-                               b1_tab, b1_len, 0);
-            if (carry != 0) {
-                carry = mp_add_ui_dec(r->tab + b_offset + b1_len, carry,
-                                      r_len - (b_offset + b1_len));
-            }
-            if (carry != 0) {
-                if (bfdec_resize(r, r_len + 1)) {
-                    if (b_shift != 0)
-                        bf_free(s, b1_tab);
-                    goto fail;
-                }
-                r->tab[r_len] = 1;
-                r->expn += LIMB_DIGITS;
-            }
-        }
-        if (b_shift != 0)
-            bf_free(s, b1_tab);
-    renorm:
-        ret = bfdec_normalize_and_round(r, prec, flags);
-    }
-    return ret;
- fail:
-    bfdec_set_nan(r);
-    return BF_ST_MEM_ERROR;
-}
-
-static int __bfdec_add(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-                     bf_flags_t flags)
-{
-    return bfdec_add_internal(r, a, b, prec, flags, 0);
-}
-
-static int __bfdec_sub(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-                     bf_flags_t flags)
-{
-    return bfdec_add_internal(r, a, b, prec, flags, 1);
-}
-
-int bfdec_add(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-              bf_flags_t flags)
-{
-    return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags,
-                  (bf_op2_func_t *)__bfdec_add);
-}
-
-int bfdec_sub(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-              bf_flags_t flags)
-{
-    return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags,
-                  (bf_op2_func_t *)__bfdec_sub);
-}
-
-int bfdec_mul(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-              bf_flags_t flags)
-{
-    int ret, r_sign;
-
-    if (a->len < b->len) {
-        const bfdec_t *tmp = a;
-        a = b;
-        b = tmp;
-    }
-    r_sign = a->sign ^ b->sign;
-    /* here b->len <= a->len */
-    if (b->len == 0) {
-        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
-            bfdec_set_nan(r);
-            ret = 0;
-        } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_INF) {
-            if ((a->expn == BF_EXP_INF && b->expn == BF_EXP_ZERO) ||
-                (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_INF)) {
-                bfdec_set_nan(r);
-                ret = BF_ST_INVALID_OP;
-            } else {
-                bfdec_set_inf(r, r_sign);
-                ret = 0;
-            }
-        } else {
-            bfdec_set_zero(r, r_sign);
-            ret = 0;
-        }
-    } else {
-        bfdec_t tmp, *r1 = NULL;
-        limb_t a_len, b_len;
-        limb_t *a_tab, *b_tab;
-
-        a_len = a->len;
-        b_len = b->len;
-        a_tab = a->tab;
-        b_tab = b->tab;
-
-        if (r == a || r == b) {
-            bfdec_init(r->ctx, &tmp);
-            r1 = r;
-            r = &tmp;
-        }
-        if (bfdec_resize(r, a_len + b_len)) {
-            bfdec_set_nan(r);
-            ret = BF_ST_MEM_ERROR;
-            goto done;
-        }
-        mp_mul_basecase_dec(r->tab, a_tab, a_len, b_tab, b_len);
-        r->sign = r_sign;
-        r->expn = a->expn + b->expn;
-        ret = bfdec_normalize_and_round(r, prec, flags);
-    done:
-        if (r == &tmp)
-            bfdec_move(r1, &tmp);
-    }
-    return ret;
-}
-
-int bfdec_mul_si(bfdec_t *r, const bfdec_t *a, int64_t b1, limb_t prec,
-                 bf_flags_t flags)
-{
-    bfdec_t b;
-    int ret;
-    bfdec_init(r->ctx, &b);
-    ret = bfdec_set_si(&b, b1);
-    ret |= bfdec_mul(r, a, &b, prec, flags);
-    bfdec_delete(&b);
-    return ret;
-}
-
-int bfdec_add_si(bfdec_t *r, const bfdec_t *a, int64_t b1, limb_t prec,
-                 bf_flags_t flags)
-{
-    bfdec_t b;
-    int ret;
-
-    bfdec_init(r->ctx, &b);
-    ret = bfdec_set_si(&b, b1);
-    ret |= bfdec_add(r, a, &b, prec, flags);
-    bfdec_delete(&b);
-    return ret;
-}
-
-static int __bfdec_div(bfdec_t *r, const bfdec_t *a, const bfdec_t *b,
-                       limb_t prec, bf_flags_t flags)
-{
-    int ret, r_sign;
-    limb_t n, nb, precl;
-
-    r_sign = a->sign ^ b->sign;
-    if (a->expn >= BF_EXP_INF || b->expn >= BF_EXP_INF) {
-        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
-            bfdec_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF && b->expn == BF_EXP_INF) {
-            bfdec_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else if (a->expn == BF_EXP_INF) {
-            bfdec_set_inf(r, r_sign);
-            return 0;
-        } else {
-            bfdec_set_zero(r, r_sign);
-            return 0;
-        }
-    } else if (a->expn == BF_EXP_ZERO) {
-        if (b->expn == BF_EXP_ZERO) {
-            bfdec_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else {
-            bfdec_set_zero(r, r_sign);
-            return 0;
-        }
-    } else if (b->expn == BF_EXP_ZERO) {
-        bfdec_set_inf(r, r_sign);
-        return BF_ST_DIVIDE_ZERO;
-    }
-
-    nb = b->len;
-    if (prec == BF_PREC_INF) {
-        /* infinite precision: return BF_ST_INVALID_OP if not an exact
-           result */
-        /* XXX: check */
-        precl = nb + 1;
-    } else if (flags & BF_FLAG_RADPNT_PREC) {
-        /* number of digits after the decimal point */
-        /* XXX: check (2 extra digits for rounding + 2 digits) */
-        precl = (bf_max(a->expn - b->expn, 0) + 2 +
-                 prec + 2 + LIMB_DIGITS - 1) / LIMB_DIGITS;
-    } else {
-        /* number of limbs of the quotient (2 extra digits for rounding) */
-        precl = (prec + 2 + LIMB_DIGITS - 1) / LIMB_DIGITS;
-    }
-    n = bf_max(a->len, precl);
-
-    {
-        limb_t *taba, na, i;
-        slimb_t d;
-
-        na = n + nb;
-        taba = bf_malloc(r->ctx, (na + 1) * sizeof(limb_t));
-        if (!taba)
-            goto fail;
-        d = na - a->len;
-        memset(taba, 0, d * sizeof(limb_t));
-        memcpy(taba + d, a->tab, a->len * sizeof(limb_t));
-        if (bfdec_resize(r, n + 1))
-            goto fail1;
-        if (mp_div_dec(r->ctx, r->tab, taba, na, b->tab, nb)) {
-        fail1:
-            bf_free(r->ctx, taba);
-            goto fail;
-        }
-        /* see if non zero remainder */
-        for(i = 0; i < nb; i++) {
-            if (taba[i] != 0)
-                break;
-        }
-        bf_free(r->ctx, taba);
-        if (i != nb) {
-            if (prec == BF_PREC_INF) {
-                bfdec_set_nan(r);
-                return BF_ST_INVALID_OP;
-            } else {
-                r->tab[0] |= 1;
-            }
-        }
-        r->expn = a->expn - b->expn + LIMB_DIGITS;
-        r->sign = r_sign;
-        ret = bfdec_normalize_and_round(r, prec, flags);
-    }
-    return ret;
- fail:
-    bfdec_set_nan(r);
-    return BF_ST_MEM_ERROR;
-}
-
-int bfdec_div(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-              bf_flags_t flags)
-{
-    return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags,
-                  (bf_op2_func_t *)__bfdec_div);
-}
-
-/* a and b must be finite numbers with a >= 0 and b > 0. 'q' is the
-   integer defined as floor(a/b) and r = a - q * b. */
-static void bfdec_tdivremu(bf_context_t *s, bfdec_t *q, bfdec_t *r,
-                           const bfdec_t *a, const bfdec_t *b)
-{
-    if (bfdec_cmpu(a, b) < 0) {
-        bfdec_set_ui(q, 0);
-        bfdec_set(r, a);
-    } else {
-        bfdec_div(q, a, b, 0, BF_RNDZ | BF_FLAG_RADPNT_PREC);
-        bfdec_mul(r, q, b, BF_PREC_INF, BF_RNDZ);
-        bfdec_sub(r, a, r, BF_PREC_INF, BF_RNDZ);
-    }
-}
-
-/* division and remainder.
-
-   rnd_mode is the rounding mode for the quotient. The additional
-   rounding mode BF_RND_EUCLIDIAN is supported.
-
-   'q' is an integer. 'r' is rounded with prec and flags (prec can be
-   BF_PREC_INF).
-*/
-int bfdec_divrem(bfdec_t *q, bfdec_t *r, const bfdec_t *a, const bfdec_t *b,
-                 limb_t prec, bf_flags_t flags, int rnd_mode)
-{
-    bf_context_t *s = q->ctx;
-    bfdec_t a1_s, *a1 = &a1_s;
-    bfdec_t b1_s, *b1 = &b1_s;
-    bfdec_t r1_s, *r1 = &r1_s;
-    int q_sign, res;
-    bool is_ceil, is_rndn;
-
-    assert(q != a && q != b);
-    assert(r != a && r != b);
-    assert(q != r);
-
-    if (a->len == 0 || b->len == 0) {
-        bfdec_set_zero(q, 0);
-        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
-            bfdec_set_nan(r);
-            return 0;
-        } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_ZERO) {
-            bfdec_set_nan(r);
-            return BF_ST_INVALID_OP;
-        } else {
-            bfdec_set(r, a);
-            return bfdec_round(r, prec, flags);
-        }
-    }
-
-    q_sign = a->sign ^ b->sign;
-    is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA);
-    switch(rnd_mode) {
-    default:
-    case BF_RNDZ:
-    case BF_RNDN:
-    case BF_RNDNA:
-        is_ceil = false;
-        break;
-    case BF_RNDD:
-        is_ceil = q_sign;
-        break;
-    case BF_RNDU:
-        is_ceil = q_sign ^ 1;
-        break;
-    case BF_RNDA:
-        is_ceil = true;
-        break;
-    case BF_DIVREM_EUCLIDIAN:
-        is_ceil = a->sign;
-        break;
-    }
-
-    a1->expn = a->expn;
-    a1->tab = a->tab;
-    a1->len = a->len;
-    a1->sign = 0;
-
-    b1->expn = b->expn;
-    b1->tab = b->tab;
-    b1->len = b->len;
-    b1->sign = 0;
-
-    //    bfdec_print_str("a1", a1);
-    //    bfdec_print_str("b1", b1);
-    /* XXX: could improve to avoid having a large 'q' */
-    bfdec_tdivremu(s, q, r, a1, b1);
-    if (bfdec_is_nan(q) || bfdec_is_nan(r))
-        goto fail;
-    //    bfdec_print_str("q", q);
-    //    bfdec_print_str("r", r);
-
-    if (r->len != 0) {
-        if (is_rndn) {
-            bfdec_init(s, r1);
-            if (bfdec_set(r1, r))
-                goto fail;
-            if (bfdec_mul_si(r1, r1, 2, BF_PREC_INF, BF_RNDZ)) {
-                bfdec_delete(r1);
-                goto fail;
-            }
-            res = bfdec_cmpu(r1, b);
-            bfdec_delete(r1);
-            if (res > 0 ||
-                (res == 0 &&
-                 (rnd_mode == BF_RNDNA ||
-                  (get_digit(q->tab, q->len, q->len * LIMB_DIGITS - q->expn) & 1) != 0))) {
-                goto do_sub_r;
-            }
-        } else if (is_ceil) {
-        do_sub_r:
-            res = bfdec_add_si(q, q, 1, BF_PREC_INF, BF_RNDZ);
-            res |= bfdec_sub(r, r, b1, BF_PREC_INF, BF_RNDZ);
-            if (res & BF_ST_MEM_ERROR)
-                goto fail;
-        }
-    }
-
-    r->sign ^= a->sign;
-    q->sign = q_sign;
-    return bfdec_round(r, prec, flags);
- fail:
-    bfdec_set_nan(q);
-    bfdec_set_nan(r);
-    return BF_ST_MEM_ERROR;
-}
-
-int bfdec_rem(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-              bf_flags_t flags, int rnd_mode)
-{
-    bfdec_t q_s, *q = &q_s;
-    int ret;
-
-    bfdec_init(r->ctx, q);
-    ret = bfdec_divrem(q, r, a, b, prec, flags, rnd_mode);
-    bfdec_delete(q);
-    return ret;
-}
-
-/* convert to integer (infinite precision) */
-int bfdec_rint(bfdec_t *r, int rnd_mode)
-{
-    return bfdec_round(r, 0, rnd_mode | BF_FLAG_RADPNT_PREC);
-}
-
-int bfdec_sqrt(bfdec_t *r, const bfdec_t *a, limb_t prec, bf_flags_t flags)
-{
-    bf_context_t *s = a->ctx;
-    int ret, k;
-    limb_t *a1, v;
-    slimb_t n, n1, prec1;
-    limb_t res;
-
-    assert(r != a);
-
-    if (a->len == 0) {
-        if (a->expn == BF_EXP_NAN) {
-            bfdec_set_nan(r);
-        } else if (a->expn == BF_EXP_INF && a->sign) {
-            goto invalid_op;
-        } else {
-            bfdec_set(r, a);
-        }
-        ret = 0;
-    } else if (a->sign || prec == BF_PREC_INF) {
- invalid_op:
-        bfdec_set_nan(r);
-        ret = BF_ST_INVALID_OP;
-    } else {
-        if (flags & BF_FLAG_RADPNT_PREC) {
-            prec1 = bf_max(floor_div(a->expn + 1, 2) + prec, 1);
-        } else {
-            prec1 = prec;
-        }
-        /* convert the mantissa to an integer with at least 2 *
-           prec + 4 digits */
-        n = (2 * (prec1 + 2) + 2 * LIMB_DIGITS - 1) / (2 * LIMB_DIGITS);
-        if (bfdec_resize(r, n))
-            goto fail;
-        a1 = bf_malloc(s, sizeof(limb_t) * 2 * n);
-        if (!a1)
-            goto fail;
-        n1 = bf_min(2 * n, a->len);
-        memset(a1, 0, (2 * n - n1) * sizeof(limb_t));
-        memcpy(a1 + 2 * n - n1, a->tab + a->len - n1, n1 * sizeof(limb_t));
-        if (a->expn & 1) {
-            res = mp_shr_dec(a1, a1, 2 * n, 1, 0);
-        } else {
-            res = 0;
-        }
-        /* normalize so that a1 >= B^(2*n)/4. Not need for n = 1
-           because mp_sqrtrem2_dec already does it */
-        k = 0;
-        if (n > 1) {
-            v = a1[2 * n - 1];
-            while (v < BF_DEC_BASE / 4) {
-                k++;
-                v *= 4;
-            }
-            if (k != 0)
-                mp_mul1_dec(a1, a1, 2 * n, 1 << (2 * k), 0);
-        }
-        if (mp_sqrtrem_dec(s, r->tab, a1, n)) {
-            bf_free(s, a1);
-            goto fail;
-        }
-        if (k != 0)
-            mp_div1_dec(r->tab, r->tab, n, 1 << k, 0);
-        if (!res) {
-            res = mp_scan_nz(a1, n + 1);
-        }
-        bf_free(s, a1);
-        if (!res) {
-            res = mp_scan_nz(a->tab, a->len - n1);
-        }
-        if (res != 0)
-            r->tab[0] |= 1;
-        r->sign = 0;
-        r->expn = (a->expn + 1) >> 1;
-        ret = bfdec_round(r, prec, flags);
-    }
-    return ret;
- fail:
-    bfdec_set_nan(r);
-    return BF_ST_MEM_ERROR;
-}
-
-/* The rounding mode is always BF_RNDZ. Return BF_ST_OVERFLOW if there
-   is an overflow and 0 otherwise. No memory error is possible. */
-int bfdec_get_int32(int *pres, const bfdec_t *a)
-{
-    uint32_t v;
-    int ret;
-    if (a->expn >= BF_EXP_INF) {
-        ret = 0;
-        if (a->expn == BF_EXP_INF) {
-            v = (uint32_t)INT32_MAX + a->sign;
-             /* XXX: return overflow ? */
-        } else {
-            v = INT32_MAX;
-        }
-    } else if (a->expn <= 0) {
-        v = 0;
-        ret = 0;
-    } else if (a->expn <= 9) {
-        v = fast_shr_dec(a->tab[a->len - 1], LIMB_DIGITS - a->expn);
-        if (a->sign)
-            v = -v;
-        ret = 0;
-    } else if (a->expn == 10) {
-        uint64_t v1;
-        uint32_t v_max;
-#if LIMB_BITS == 64
-        v1 = fast_shr_dec(a->tab[a->len - 1], LIMB_DIGITS - a->expn);
-#else
-        v1 = (uint64_t)a->tab[a->len - 1] * 10 +
-            get_digit(a->tab, a->len, (a->len - 1) * LIMB_DIGITS - 1);
-#endif
-        v_max = (uint32_t)INT32_MAX + a->sign;
-        if (v1 > v_max) {
-            v = v_max;
-            ret = BF_ST_OVERFLOW;
-        } else {
-            v = v1;
-            if (a->sign)
-                v = -v;
-            ret = 0;
-        }
-    } else {
-        v = (uint32_t)INT32_MAX + a->sign;
-        ret = BF_ST_OVERFLOW;
-    }
-    *pres = v;
-    return ret;
-}
-
-/* power to an integer with infinite precision */
-int bfdec_pow_ui(bfdec_t *r, const bfdec_t *a, limb_t b)
-{
-    int ret, n_bits, i;
-
-    assert(r != a);
-    if (b == 0)
-        return bfdec_set_ui(r, 1);
-    ret = bfdec_set(r, a);
-    n_bits = LIMB_BITS - clz(b);
-    for(i = n_bits - 2; i >= 0; i--) {
-        ret |= bfdec_mul(r, r, r, BF_PREC_INF, BF_RNDZ);
-        if ((b >> i) & 1)
-            ret |= bfdec_mul(r, r, a, BF_PREC_INF, BF_RNDZ);
-    }
-    return ret;
-}
-
-char *bfdec_ftoa(size_t *plen, const bfdec_t *a, limb_t prec, bf_flags_t flags)
-{
-    return bf_ftoa_internal(plen, (const bf_t *)a, 10, prec, flags, true);
-}
-
-int bfdec_atof(bfdec_t *r, const char *str, const char **pnext,
-               limb_t prec, bf_flags_t flags)
-{
-    slimb_t dummy_exp;
-    return bf_atof_internal((bf_t *)r, &dummy_exp, str, pnext, 10, prec,
-                            flags, true);
-}
-
-#endif /* USE_BF_DEC */
-
-#ifdef USE_FFT_MUL
-/***************************************************************/
-/* Integer multiplication with FFT */
-
-/* or LIMB_BITS at bit position 'pos' in tab */
-static inline void put_bits(limb_t *tab, limb_t len, slimb_t pos, limb_t val)
-{
-    limb_t i;
-    int p;
-
-    i = pos >> LIMB_LOG2_BITS;
-    p = pos & (LIMB_BITS - 1);
-    if (i < len)
-        tab[i] |= val << p;
-    if (p != 0) {
-        i++;
-        if (i < len) {
-            tab[i] |= val >> (LIMB_BITS - p);
-        }
-    }
-}
-
-#if defined(__AVX2__)
-
-typedef double NTTLimb;
-
-/* we must have: modulo >= 1 << NTT_MOD_LOG2_MIN */
-#define NTT_MOD_LOG2_MIN 50
-#define NTT_MOD_LOG2_MAX 51
-#define NB_MODS 5
-#define NTT_PROOT_2EXP 39
-static const int ntt_int_bits[NB_MODS] = { 254, 203, 152, 101, 50, };
-
-static const limb_t ntt_mods[NB_MODS] = { 0x00073a8000000001, 0x0007858000000001, 0x0007a38000000001, 0x0007a68000000001, 0x0007fd8000000001,
-};
-
-static const limb_t ntt_proot[2][NB_MODS] = {
-    { 0x00056198d44332c8, 0x0002eb5d640aad39, 0x00047e31eaa35fd0, 0x0005271ac118a150, 0x00075e0ce8442bd5, },
-    { 0x000461169761bcc5, 0x0002dac3cb2da688, 0x0004abc97751e3bf, 0x000656778fc8c485, 0x0000dc6469c269fa, },
-};
-
-static const limb_t ntt_mods_cr[NB_MODS * (NB_MODS - 1) / 2] = {
- 0x00020e4da740da8e, 0x0004c3dc09c09c1d, 0x000063bd097b4271, 0x000799d8f18f18fd,
- 0x0005384222222264, 0x000572b07c1f07fe, 0x00035cd08888889a,
- 0x00066015555557e3, 0x000725960b60b623,
- 0x0002fc1fa1d6ce12,
-};
-
-#else
-
-typedef limb_t NTTLimb;
-
-#if LIMB_BITS == 64
-
-#define NTT_MOD_LOG2_MIN 61
-#define NTT_MOD_LOG2_MAX 62
-#define NB_MODS 5
-#define NTT_PROOT_2EXP 51
-static const int ntt_int_bits[NB_MODS] = { 307, 246, 185, 123, 61, };
-
-static const limb_t ntt_mods[NB_MODS] = { 0x28d8000000000001, 0x2a88000000000001, 0x2ed8000000000001, 0x3508000000000001, 0x3aa8000000000001,
-};
-
-static const limb_t ntt_proot[2][NB_MODS] = {
-    { 0x1b8ea61034a2bea7, 0x21a9762de58206fb, 0x02ca782f0756a8ea, 0x278384537a3e50a1, 0x106e13fee74ce0ab, },
-    { 0x233513af133e13b8, 0x1d13140d1c6f75f1, 0x12cde57f97e3eeda, 0x0d6149e23cbe654f, 0x36cd204f522a1379, },
-};
-
-static const limb_t ntt_mods_cr[NB_MODS * (NB_MODS - 1) / 2] = {
- 0x08a9ed097b425eea, 0x18a44aaaaaaaaab3, 0x2493f57f57f57f5d, 0x126b8d0649a7f8d4,
- 0x09d80ed7303b5ccc, 0x25b8bcf3cf3cf3d5, 0x2ce6ce63398ce638,
- 0x0e31fad40a57eb59, 0x02a3529fd4a7f52f,
- 0x3a5493e93e93e94a,
-};
-
-#elif LIMB_BITS == 32
-
-/* we must have: modulo >= 1 << NTT_MOD_LOG2_MIN */
-#define NTT_MOD_LOG2_MIN 29
-#define NTT_MOD_LOG2_MAX 30
-#define NB_MODS 5
-#define NTT_PROOT_2EXP 20
-static const int ntt_int_bits[NB_MODS] = { 148, 119, 89, 59, 29, };
-
-static const limb_t ntt_mods[NB_MODS] = { 0x0000000032b00001, 0x0000000033700001, 0x0000000036d00001, 0x0000000037300001, 0x000000003e500001,
-};
-
-static const limb_t ntt_proot[2][NB_MODS] = {
-    { 0x0000000032525f31, 0x0000000005eb3b37, 0x00000000246eda9f, 0x0000000035f25901, 0x00000000022f5768, },
-    { 0x00000000051eba1a, 0x00000000107be10e, 0x000000001cd574e0, 0x00000000053806e6, 0x000000002cd6bf98, },
-};
-
-static const limb_t ntt_mods_cr[NB_MODS * (NB_MODS - 1) / 2] = {
- 0x000000000449559a, 0x000000001eba6ca9, 0x000000002ec18e46, 0x000000000860160b,
- 0x000000000d321307, 0x000000000bf51120, 0x000000000f662938,
- 0x000000000932ab3e, 0x000000002f40eef8,
- 0x000000002e760905,
-};
-
-#endif /* LIMB_BITS */
-
-#endif /* !AVX2 */
-
-#if defined(__AVX2__)
-#define NTT_TRIG_K_MAX 18
-#else
-#define NTT_TRIG_K_MAX 19
-#endif
-
-typedef struct BFNTTState {
-    bf_context_t *ctx;
-
-    /* used for mul_mod_fast() */
-    limb_t ntt_mods_div[NB_MODS];
-
-    limb_t ntt_proot_pow[NB_MODS][2][NTT_PROOT_2EXP + 1];
-    limb_t ntt_proot_pow_inv[NB_MODS][2][NTT_PROOT_2EXP + 1];
-    NTTLimb *ntt_trig[NB_MODS][2][NTT_TRIG_K_MAX + 1];
-    /* 1/2^n mod m */
-    limb_t ntt_len_inv[NB_MODS][NTT_PROOT_2EXP + 1][2];
-#if defined(__AVX2__)
-    __m256d ntt_mods_cr_vec[NB_MODS * (NB_MODS - 1) / 2];
-    __m256d ntt_mods_vec[NB_MODS];
-    __m256d ntt_mods_inv_vec[NB_MODS];
-#else
-    limb_t ntt_mods_cr_inv[NB_MODS * (NB_MODS - 1) / 2];
-#endif
-} BFNTTState;
-
-static NTTLimb *get_trig(BFNTTState *s, int k, int inverse, int m_idx);
-
-/* add modulo with up to (LIMB_BITS-1) bit modulo */
-static inline limb_t add_mod(limb_t a, limb_t b, limb_t m)
-{
-    limb_t r;
-    r = a + b;
-    if (r >= m)
-        r -= m;
-    return r;
-}
-
-/* sub modulo with up to LIMB_BITS bit modulo */
-static inline limb_t sub_mod(limb_t a, limb_t b, limb_t m)
-{
-    limb_t r;
-    r = a - b;
-    if (r > a)
-        r += m;
-    return r;
-}
-
-/* return (r0+r1*B) mod m
-   precondition: 0 <= r0+r1*B < 2^(64+NTT_MOD_LOG2_MIN)
-*/
-static inline limb_t mod_fast(dlimb_t r,
-                                limb_t m, limb_t m_inv)
-{
-    limb_t a1, q, t0, r1, r0;
-
-    a1 = r >> NTT_MOD_LOG2_MIN;
-
-    q = ((dlimb_t)a1 * m_inv) >> LIMB_BITS;
-    r = r - (dlimb_t)q * m - m * 2;
-    r1 = r >> LIMB_BITS;
-    t0 = (slimb_t)r1 >> 1;
-    r += m & t0;
-    r0 = r;
-    r1 = r >> LIMB_BITS;
-    r0 += m & r1;
-    return r0;
-}
-
-/* faster version using precomputed modulo inverse.
-   precondition: 0 <= a * b < 2^(64+NTT_MOD_LOG2_MIN) */
-static inline limb_t mul_mod_fast(limb_t a, limb_t b,
-                                    limb_t m, limb_t m_inv)
-{
-    dlimb_t r;
-    r = (dlimb_t)a * (dlimb_t)b;
-    return mod_fast(r, m, m_inv);
-}
-
-static inline limb_t init_mul_mod_fast(limb_t m)
-{
-    dlimb_t t;
-    assert(m < (limb_t)1 << NTT_MOD_LOG2_MAX);
-    assert(m >= (limb_t)1 << NTT_MOD_LOG2_MIN);
-    t = (dlimb_t)1 << (LIMB_BITS + NTT_MOD_LOG2_MIN);
-    return t / m;
-}
-
-/* Faster version used when the multiplier is constant. 0 <= a < 2^64,
-   0 <= b < m. */
-static inline limb_t mul_mod_fast2(limb_t a, limb_t b,
-                                     limb_t m, limb_t b_inv)
-{
-    limb_t r, q;
-
-    q = ((dlimb_t)a * (dlimb_t)b_inv) >> LIMB_BITS;
-    r = a * b - q * m;
-    if (r >= m)
-        r -= m;
-    return r;
-}
-
-/* Faster version used when the multiplier is constant. 0 <= a < 2^64,
-   0 <= b < m. Let r = a * b mod m. The return value is 'r' or 'r +
-   m'. */
-static inline limb_t mul_mod_fast3(limb_t a, limb_t b,
-                                     limb_t m, limb_t b_inv)
-{
-    limb_t r, q;
-
-    q = ((dlimb_t)a * (dlimb_t)b_inv) >> LIMB_BITS;
-    r = a * b - q * m;
-    return r;
-}
-
-static inline limb_t init_mul_mod_fast2(limb_t b, limb_t m)
-{
-    return ((dlimb_t)b << LIMB_BITS) / m;
-}
-
-#ifdef __AVX2__
-
-static inline limb_t ntt_limb_to_int(NTTLimb a, limb_t m)
-{
-    slimb_t v;
-    v = a;
-    if (v < 0)
-        v += m;
-    if (v >= m)
-        v -= m;
-    return v;
-}
-
-static inline NTTLimb int_to_ntt_limb(limb_t a, limb_t m)
-{
-    return (slimb_t)a;
-}
-
-static inline NTTLimb int_to_ntt_limb2(limb_t a, limb_t m)
-{
-    if (a >= (m / 2))
-        a -= m;
-    return (slimb_t)a;
-}
-
-/* return r + m if r < 0 otherwise r. */
-static inline __m256d ntt_mod1(__m256d r, __m256d m)
-{
-    return _mm256_blendv_pd(r, r + m, r);
-}
-
-/* input: abs(r) < 2 * m. Output: abs(r) < m */
-static inline __m256d ntt_mod(__m256d r, __m256d mf, __m256d m2f)
-{
-    return _mm256_blendv_pd(r, r + m2f, r) - mf;
-}
-
-/* input: abs(a*b) < 2 * m^2, output: abs(r) < m */
-static inline __m256d ntt_mul_mod(__m256d a, __m256d b, __m256d mf,
-                                  __m256d m_inv)
-{
-    __m256d r, q, ab1, ab0, qm0, qm1;
-    ab1 = a * b;
-    q = _mm256_round_pd(ab1 * m_inv, 0); /* round to nearest */
-    qm1 = q * mf;
-    qm0 = _mm256_fmsub_pd(q, mf, qm1); /* low part */
-    ab0 = _mm256_fmsub_pd(a, b, ab1); /* low part */
-    r = (ab1 - qm1) + (ab0 - qm0);
-    return r;
-}
-
-static void *bf_aligned_malloc(bf_context_t *s, size_t size, size_t align)
-{
-    void *ptr;
-    void **ptr1;
-    ptr = bf_malloc(s, size + sizeof(void *) + align - 1);
-    if (!ptr)
-        return NULL;
-    ptr1 = (void **)(((uintptr_t)ptr + sizeof(void *) + align - 1) &
-                     ~(align - 1));
-    ptr1[-1] = ptr;
-    return ptr1;
-}
-
-static void bf_aligned_free(bf_context_t *s, void *ptr)
-{
-    if (!ptr)
-        return;
-    bf_free(s, ((void **)ptr)[-1]);
-}
-
-static void *ntt_malloc(BFNTTState *s, size_t size)
-{
-    return bf_aligned_malloc(s->ctx, size, 64);
-}
-
-static void ntt_free(BFNTTState *s, void *ptr)
-{
-    bf_aligned_free(s->ctx, ptr);
-}
-
-static no_inline int ntt_fft(BFNTTState *s,
-                             NTTLimb *out_buf, NTTLimb *in_buf,
-                             NTTLimb *tmp_buf, int fft_len_log2,
-                             int inverse, int m_idx)
-{
-    limb_t nb_blocks, fft_per_block, p, k, n, stride_in, i, j;
-    NTTLimb *tab_in, *tab_out, *tmp, *trig;
-    __m256d m_inv, mf, m2f, c, a0, a1, b0, b1;
-    limb_t m;
-    int l;
-
-    m = ntt_mods[m_idx];
-
-    m_inv = _mm256_set1_pd(1.0 / (double)m);
-    mf = _mm256_set1_pd(m);
-    m2f = _mm256_set1_pd(m * 2);
-
-    n = (limb_t)1 << fft_len_log2;
-    assert(n >= 8);
-    stride_in = n / 2;
-
-    tab_in = in_buf;
-    tab_out = tmp_buf;
-    trig = get_trig(s, fft_len_log2, inverse, m_idx);
-    if (!trig)
-        return -1;
-    p = 0;
-    for(k = 0; k < stride_in; k += 4) {
-        a0 = _mm256_load_pd(&tab_in[k]);
-        a1 = _mm256_load_pd(&tab_in[k + stride_in]);
-        c = _mm256_load_pd(trig);
-        trig += 4;
-        b0 = ntt_mod(a0 + a1, mf, m2f);
-        b1 = ntt_mul_mod(a0 - a1, c, mf, m_inv);
-        a0 = _mm256_permute2f128_pd(b0, b1, 0x20);
-        a1 = _mm256_permute2f128_pd(b0, b1, 0x31);
-        a0 = _mm256_permute4x64_pd(a0, 0xd8);
-        a1 = _mm256_permute4x64_pd(a1, 0xd8);
-        _mm256_store_pd(&tab_out[p], a0);
-        _mm256_store_pd(&tab_out[p + 4], a1);
-        p += 2 * 4;
-    }
-    tmp = tab_in;
-    tab_in = tab_out;
-    tab_out = tmp;
-
-    trig = get_trig(s, fft_len_log2 - 1, inverse, m_idx);
-    if (!trig)
-        return -1;
-    p = 0;
-    for(k = 0; k < stride_in; k += 4) {
-        a0 = _mm256_load_pd(&tab_in[k]);
-        a1 = _mm256_load_pd(&tab_in[k + stride_in]);
-        c = _mm256_setr_pd(trig[0], trig[0], trig[1], trig[1]);
-        trig += 2;
-        b0 = ntt_mod(a0 + a1, mf, m2f);
-        b1 = ntt_mul_mod(a0 - a1, c, mf, m_inv);
-        a0 = _mm256_permute2f128_pd(b0, b1, 0x20);
-        a1 = _mm256_permute2f128_pd(b0, b1, 0x31);
-        _mm256_store_pd(&tab_out[p], a0);
-        _mm256_store_pd(&tab_out[p + 4], a1);
-        p += 2 * 4;
-    }
-    tmp = tab_in;
-    tab_in = tab_out;
-    tab_out = tmp;
-
-    nb_blocks = n / 4;
-    fft_per_block = 4;
-
-    l = fft_len_log2 - 2;
-    while (nb_blocks != 2) {
-        nb_blocks >>= 1;
-        p = 0;
-        k = 0;
-        trig = get_trig(s, l, inverse, m_idx);
-        if (!trig)
-            return -1;
-        for(i = 0; i < nb_blocks; i++) {
-            c = _mm256_set1_pd(trig[0]);
-            trig++;
-            for(j = 0; j < fft_per_block; j += 4) {
-                a0 = _mm256_load_pd(&tab_in[k + j]);
-                a1 = _mm256_load_pd(&tab_in[k + j + stride_in]);
-                b0 = ntt_mod(a0 + a1, mf, m2f);
-                b1 = ntt_mul_mod(a0 - a1, c, mf, m_inv);
-                _mm256_store_pd(&tab_out[p + j], b0);
-                _mm256_store_pd(&tab_out[p + j + fft_per_block], b1);
-            }
-            k += fft_per_block;
-            p += 2 * fft_per_block;
-        }
-        fft_per_block <<= 1;
-        l--;
-        tmp = tab_in;
-        tab_in = tab_out;
-        tab_out = tmp;
-    }
-
-    tab_out = out_buf;
-    for(k = 0; k < stride_in; k += 4) {
-        a0 = _mm256_load_pd(&tab_in[k]);
-        a1 = _mm256_load_pd(&tab_in[k + stride_in]);
-        b0 = ntt_mod(a0 + a1, mf, m2f);
-        b1 = ntt_mod(a0 - a1, mf, m2f);
-        _mm256_store_pd(&tab_out[k], b0);
-        _mm256_store_pd(&tab_out[k + stride_in], b1);
-    }
-    return 0;
-}
-
-static void ntt_vec_mul(BFNTTState *s,
-                        NTTLimb *tab1, NTTLimb *tab2, limb_t fft_len_log2,
-                        int k_tot, int m_idx)
-{
-    limb_t i, c_inv, n, m;
-    __m256d m_inv, mf, a, b, c;
-
-    m = ntt_mods[m_idx];
-    c_inv = s->ntt_len_inv[m_idx][k_tot][0];
-    m_inv = _mm256_set1_pd(1.0 / (double)m);
-    mf = _mm256_set1_pd(m);
-    c = _mm256_set1_pd(int_to_ntt_limb(c_inv, m));
-    n = (limb_t)1 << fft_len_log2;
-    for(i = 0; i < n; i += 4) {
-        a = _mm256_load_pd(&tab1[i]);
-        b = _mm256_load_pd(&tab2[i]);
-        a = ntt_mul_mod(a, b, mf, m_inv);
-        a = ntt_mul_mod(a, c, mf, m_inv);
-        _mm256_store_pd(&tab1[i], a);
-    }
-}
-
-static no_inline void mul_trig(NTTLimb *buf,
-                               limb_t n, limb_t c1, limb_t m, limb_t m_inv1)
-{
-    limb_t i, c2, c3, c4;
-    __m256d c, c_mul, a0, mf, m_inv;
-    assert(n >= 2);
-
-    mf = _mm256_set1_pd(m);
-    m_inv = _mm256_set1_pd(1.0 / (double)m);
-
-    c2 = mul_mod_fast(c1, c1, m, m_inv1);
-    c3 = mul_mod_fast(c2, c1, m, m_inv1);
-    c4 = mul_mod_fast(c2, c2, m, m_inv1);
-    c = _mm256_setr_pd(1, int_to_ntt_limb(c1, m),
-                       int_to_ntt_limb(c2, m), int_to_ntt_limb(c3, m));
-    c_mul = _mm256_set1_pd(int_to_ntt_limb(c4, m));
-    for(i = 0; i < n; i += 4) {
-        a0 = _mm256_load_pd(&buf[i]);
-        a0 = ntt_mul_mod(a0, c, mf, m_inv);
-        _mm256_store_pd(&buf[i], a0);
-        c = ntt_mul_mod(c, c_mul, mf, m_inv);
-    }
-}
-
-#else
-
-static void *ntt_malloc(BFNTTState *s, size_t size)
-{
-    return bf_malloc(s->ctx, size);
-}
-
-static void ntt_free(BFNTTState *s, void *ptr)
-{
-    bf_free(s->ctx, ptr);
-}
-
-static inline limb_t ntt_limb_to_int(NTTLimb a, limb_t m)
-{
-    if (a >= m)
-        a -= m;
-    return a;
-}
-
-static inline NTTLimb int_to_ntt_limb(slimb_t a, limb_t m)
-{
-    return a;
-}
-
-static no_inline int ntt_fft(BFNTTState *s, NTTLimb *out_buf, NTTLimb *in_buf,
-                             NTTLimb *tmp_buf, int fft_len_log2,
-                             int inverse, int m_idx)
-{
-    limb_t nb_blocks, fft_per_block, p, k, n, stride_in, i, j, m, m2;
-    NTTLimb *tab_in, *tab_out, *tmp, a0, a1, b0, b1, c, *trig, c_inv;
-    int l;
-
-    m = ntt_mods[m_idx];
-    m2 = 2 * m;
-    n = (limb_t)1 << fft_len_log2;
-    nb_blocks = n;
-    fft_per_block = 1;
-    stride_in = n / 2;
-    tab_in = in_buf;
-    tab_out = tmp_buf;
-    l = fft_len_log2;
-    while (nb_blocks != 2) {
-        nb_blocks >>= 1;
-        p = 0;
-        k = 0;
-        trig = get_trig(s, l, inverse, m_idx);
-        if (!trig)
-            return -1;
-        for(i = 0; i < nb_blocks; i++) {
-            c = trig[0];
-            c_inv = trig[1];
-            trig += 2;
-            for(j = 0; j < fft_per_block; j++) {
-                a0 = tab_in[k + j];
-                a1 = tab_in[k + j + stride_in];
-                b0 = add_mod(a0, a1, m2);
-                b1 = a0 - a1 + m2;
-                b1 = mul_mod_fast3(b1, c, m, c_inv);
-                tab_out[p + j] = b0;
-                tab_out[p + j + fft_per_block] = b1;
-            }
-            k += fft_per_block;
-            p += 2 * fft_per_block;
-        }
-        fft_per_block <<= 1;
-        l--;
-        tmp = tab_in;
-        tab_in = tab_out;
-        tab_out = tmp;
-    }
-    /* no twiddle in last step */
-    tab_out = out_buf;
-    for(k = 0; k < stride_in; k++) {
-        a0 = tab_in[k];
-        a1 = tab_in[k + stride_in];
-        b0 = add_mod(a0, a1, m2);
-        b1 = sub_mod(a0, a1, m2);
-        tab_out[k] = b0;
-        tab_out[k + stride_in] = b1;
-    }
-    return 0;
-}
-
-static void ntt_vec_mul(BFNTTState *s,
-                        NTTLimb *tab1, NTTLimb *tab2, int fft_len_log2,
-                        int k_tot, int m_idx)
-{
-    limb_t i, norm, norm_inv, a, n, m, m_inv;
-
-    m = ntt_mods[m_idx];
-    m_inv = s->ntt_mods_div[m_idx];
-    norm = s->ntt_len_inv[m_idx][k_tot][0];
-    norm_inv = s->ntt_len_inv[m_idx][k_tot][1];
-    n = (limb_t)1 << fft_len_log2;
-    for(i = 0; i < n; i++) {
-        a = tab1[i];
-        /* need to reduce the range so that the product is <
-           2^(LIMB_BITS+NTT_MOD_LOG2_MIN) */
-        if (a >= m)
-            a -= m;
-        a = mul_mod_fast(a, tab2[i], m, m_inv);
-        a = mul_mod_fast3(a, norm, m, norm_inv);
-        tab1[i] = a;
-    }
-}
-
-static no_inline void mul_trig(NTTLimb *buf,
-                               limb_t n, limb_t c_mul, limb_t m, limb_t m_inv)
-{
-    limb_t i, c0, c_mul_inv;
-
-    c0 = 1;
-    c_mul_inv = init_mul_mod_fast2(c_mul, m);
-    for(i = 0; i < n; i++) {
-        buf[i] = mul_mod_fast(buf[i], c0, m, m_inv);
-        c0 = mul_mod_fast2(c0, c_mul, m, c_mul_inv);
-    }
-}
-
-#endif /* !AVX2 */
-
-static no_inline NTTLimb *get_trig(BFNTTState *s,
-                                   int k, int inverse, int m_idx)
-{
-    NTTLimb *tab;
-    limb_t i, n2, c, c_mul, m, c_mul_inv;
-
-    if (k > NTT_TRIG_K_MAX)
-        return NULL;
-
-    tab = s->ntt_trig[m_idx][inverse][k];
-    if (tab)
-        return tab;
-    n2 = (limb_t)1 << (k - 1);
-    m = ntt_mods[m_idx];
-#ifdef __AVX2__
-    tab = ntt_malloc(s, sizeof(NTTLimb) * n2);
-#else
-    tab = ntt_malloc(s, sizeof(NTTLimb) * n2 * 2);
-#endif
-    if (!tab)
-        return NULL;
-    c = 1;
-    c_mul = s->ntt_proot_pow[m_idx][inverse][k];
-    c_mul_inv = s->ntt_proot_pow_inv[m_idx][inverse][k];
-    for(i = 0; i < n2; i++) {
-#ifdef __AVX2__
-        tab[i] = int_to_ntt_limb2(c, m);
-#else
-        tab[2 * i] = int_to_ntt_limb(c, m);
-        tab[2 * i + 1] = init_mul_mod_fast2(c, m);
-#endif
-        c = mul_mod_fast2(c, c_mul, m, c_mul_inv);
-    }
-    s->ntt_trig[m_idx][inverse][k] = tab;
-    return tab;
-}
-
-void fft_clear_cache(bf_context_t *s1)
-{
-    int m_idx, inverse, k;
-    BFNTTState *s = s1->ntt_state;
-    if (s) {
-        for(m_idx = 0; m_idx < NB_MODS; m_idx++) {
-            for(inverse = 0; inverse < 2; inverse++) {
-                for(k = 0; k < NTT_TRIG_K_MAX + 1; k++) {
-                    if (s->ntt_trig[m_idx][inverse][k]) {
-                        ntt_free(s, s->ntt_trig[m_idx][inverse][k]);
-                        s->ntt_trig[m_idx][inverse][k] = NULL;
-                    }
-                }
-            }
-        }
-#if defined(__AVX2__)
-        bf_aligned_free(s1, s);
-#else
-        bf_free(s1, s);
-#endif
-        s1->ntt_state = NULL;
-    }
-}
-
-#define STRIP_LEN 16
-
-/* dst = buf1, src = buf2 */
-static int ntt_fft_partial(BFNTTState *s, NTTLimb *buf1,
-                           int k1, int k2, limb_t n1, limb_t n2, int inverse,
-                           limb_t m_idx)
-{
-    limb_t i, j, c_mul, c0, m, m_inv, strip_len, l;
-    NTTLimb *buf2, *buf3;
-
-    buf2 = NULL;
-    buf3 = ntt_malloc(s, sizeof(NTTLimb) * n1);
-    if (!buf3)
-        goto fail;
-    if (k2 == 0) {
-        if (ntt_fft(s, buf1, buf1, buf3, k1, inverse, m_idx))
-            goto fail;
-    } else {
-        strip_len = STRIP_LEN;
-        buf2 = ntt_malloc(s, sizeof(NTTLimb) * n1 * strip_len);
-        if (!buf2)
-            goto fail;
-        m = ntt_mods[m_idx];
-        m_inv = s->ntt_mods_div[m_idx];
-        c0 = s->ntt_proot_pow[m_idx][inverse][k1 + k2];
-        c_mul = 1;
-        assert((n2 % strip_len) == 0);
-        for(j = 0; j < n2; j += strip_len) {
-            for(i = 0; i < n1; i++) {
-                for(l = 0; l < strip_len; l++) {
-                    buf2[i + l * n1] = buf1[i * n2 + (j + l)];
-                }
-            }
-            for(l = 0; l < strip_len; l++) {
-                if (inverse)
-                    mul_trig(buf2 + l * n1, n1, c_mul, m, m_inv);
-                if (ntt_fft(s, buf2 + l * n1, buf2 + l * n1, buf3, k1, inverse, m_idx))
-                    goto fail;
-                if (!inverse)
-                    mul_trig(buf2 + l * n1, n1, c_mul, m, m_inv);
-                c_mul = mul_mod_fast(c_mul, c0, m, m_inv);
-            }
-
-            for(i = 0; i < n1; i++) {
-                for(l = 0; l < strip_len; l++) {
-                    buf1[i * n2 + (j + l)] = buf2[i + l *n1];
-                }
-            }
-        }
-        ntt_free(s, buf2);
-    }
-    ntt_free(s, buf3);
-    return 0;
- fail:
-    ntt_free(s, buf2);
-    ntt_free(s, buf3);
-    return -1;
-}
-
-
-/* dst = buf1, src = buf2, tmp = buf3 */
-static int ntt_conv(BFNTTState *s, NTTLimb *buf1, NTTLimb *buf2,
-                    int k, int k_tot, limb_t m_idx)
-{
-    limb_t n1, n2, i;
-    int k1, k2;
-
-    if (k <= NTT_TRIG_K_MAX) {
-        k1 = k;
-    } else {
-        /* recursive split of the FFT */
-        k1 = bf_min(k / 2, NTT_TRIG_K_MAX);
-    }
-    k2 = k - k1;
-    n1 = (limb_t)1 << k1;
-    n2 = (limb_t)1 << k2;
-
-    if (ntt_fft_partial(s, buf1, k1, k2, n1, n2, 0, m_idx))
-        return -1;
-    if (ntt_fft_partial(s, buf2, k1, k2, n1, n2, 0, m_idx))
-        return -1;
-    if (k2 == 0) {
-        ntt_vec_mul(s, buf1, buf2, k, k_tot, m_idx);
-    } else {
-        for(i = 0; i < n1; i++) {
-            ntt_conv(s, buf1 + i * n2, buf2 + i * n2, k2, k_tot, m_idx);
-        }
-    }
-    if (ntt_fft_partial(s, buf1, k1, k2, n1, n2, 1, m_idx))
-        return -1;
-    return 0;
-}
-
-
-static no_inline void limb_to_ntt(BFNTTState *s,
-                                  NTTLimb *tabr, limb_t fft_len,
-                                  const limb_t *taba, limb_t a_len, int dpl,
-                                  int first_m_idx, int nb_mods)
-{
-    slimb_t i, n;
-    dlimb_t a, b;
-    int j, shift;
-    limb_t base_mask1, a0, a1, a2, r, m, m_inv;
-
-    memset(tabr, 0, sizeof(NTTLimb) * fft_len * nb_mods);
-    shift = dpl & (LIMB_BITS - 1);
-    if (shift == 0)
-        base_mask1 = -1;
-    else
-        base_mask1 = ((limb_t)1 << shift) - 1;
-    n = bf_min(fft_len, (a_len * LIMB_BITS + dpl - 1) / dpl);
-    for(i = 0; i < n; i++) {
-        a0 = get_bits(taba, a_len, i * dpl);
-        if (dpl <= LIMB_BITS) {
-            a0 &= base_mask1;
-            a = a0;
-        } else {
-            a1 = get_bits(taba, a_len, i * dpl + LIMB_BITS);
-            if (dpl <= (LIMB_BITS + NTT_MOD_LOG2_MIN)) {
-                a = a0 | ((dlimb_t)(a1 & base_mask1) << LIMB_BITS);
-            } else {
-                if (dpl > 2 * LIMB_BITS) {
-                    a2 = get_bits(taba, a_len, i * dpl + LIMB_BITS * 2) &
-                        base_mask1;
-                } else {
-                    a1 &= base_mask1;
-                    a2 = 0;
-                }
-                //            printf("a=0x%016lx%016lx%016lx\n", a2, a1, a0);
-                a = (a0 >> (LIMB_BITS - NTT_MOD_LOG2_MAX + NTT_MOD_LOG2_MIN)) |
-                    ((dlimb_t)a1 << (NTT_MOD_LOG2_MAX - NTT_MOD_LOG2_MIN)) |
-                    ((dlimb_t)a2 << (LIMB_BITS + NTT_MOD_LOG2_MAX - NTT_MOD_LOG2_MIN));
-                a0 &= ((limb_t)1 << (LIMB_BITS - NTT_MOD_LOG2_MAX + NTT_MOD_LOG2_MIN)) - 1;
-            }
-        }
-        for(j = 0; j < nb_mods; j++) {
-            m = ntt_mods[first_m_idx + j];
-            m_inv = s->ntt_mods_div[first_m_idx + j];
-            r = mod_fast(a, m, m_inv);
-            if (dpl > (LIMB_BITS + NTT_MOD_LOG2_MIN)) {
-                b = ((dlimb_t)r << (LIMB_BITS - NTT_MOD_LOG2_MAX + NTT_MOD_LOG2_MIN)) | a0;
-                r = mod_fast(b, m, m_inv);
-            }
-            tabr[i + j * fft_len] = int_to_ntt_limb(r, m);
-        }
-    }
-}
-
-#if defined(__AVX2__)
-
-#define VEC_LEN 4
-
-typedef union {
-    __m256d v;
-    double d[4];
-} VecUnion;
-
-static no_inline void ntt_to_limb(BFNTTState *s, limb_t *tabr, limb_t r_len,
-                                  const NTTLimb *buf, int fft_len_log2, int dpl,
-                                  int nb_mods)
-{
-    const limb_t *mods = ntt_mods + NB_MODS - nb_mods;
-    const __m256d *mods_cr_vec, *mf, *m_inv;
-    VecUnion y[NB_MODS];
-    limb_t u[NB_MODS], carry[NB_MODS], fft_len, base_mask1, r;
-    slimb_t i, len, pos;
-    int j, k, l, shift, n_limb1, p;
-    dlimb_t t;
-
-    j = NB_MODS * (NB_MODS - 1) / 2 - nb_mods * (nb_mods - 1) / 2;
-    mods_cr_vec = s->ntt_mods_cr_vec + j;
-    mf = s->ntt_mods_vec + NB_MODS - nb_mods;
-    m_inv = s->ntt_mods_inv_vec + NB_MODS - nb_mods;
-
-    shift = dpl & (LIMB_BITS - 1);
-    if (shift == 0)
-        base_mask1 = -1;
-    else
-        base_mask1 = ((limb_t)1 << shift) - 1;
-    n_limb1 = ((unsigned)dpl - 1) / LIMB_BITS;
-    for(j = 0; j < NB_MODS; j++)
-        carry[j] = 0;
-    for(j = 0; j < NB_MODS; j++)
-        u[j] = 0; /* avoid warnings */
-    memset(tabr, 0, sizeof(limb_t) * r_len);
-    fft_len = (limb_t)1 << fft_len_log2;
-    len = bf_min(fft_len, (r_len * LIMB_BITS + dpl - 1) / dpl);
-    len = (len + VEC_LEN - 1) & ~(VEC_LEN - 1);
-    i = 0;
-    while (i < len) {
-        for(j = 0; j < nb_mods; j++)
-            y[j].v = *(__m256d *)&buf[i + fft_len * j];
-
-        /* Chinese remainder to get mixed radix representation */
-        l = 0;
-        for(j = 0; j < nb_mods - 1; j++) {
-            y[j].v = ntt_mod1(y[j].v, mf[j]);
-            for(k = j + 1; k < nb_mods; k++) {
-                y[k].v = ntt_mul_mod(y[k].v - y[j].v,
-                                     mods_cr_vec[l], mf[k], m_inv[k]);
-                l++;
-            }
-        }
-        y[j].v = ntt_mod1(y[j].v, mf[j]);
-
-        for(p = 0; p < VEC_LEN; p++) {
-            /* back to normal representation */
-            u[0] = (int64_t)y[nb_mods - 1].d[p];
-            l = 1;
-            for(j = nb_mods - 2; j >= 1; j--) {
-                r = (int64_t)y[j].d[p];
-                for(k = 0; k < l; k++) {
-                    t = (dlimb_t)u[k] * mods[j] + r;
-                    r = t >> LIMB_BITS;
-                    u[k] = t;
-                }
-                u[l] = r;
-                l++;
-            }
-            /* XXX: for nb_mods = 5, l should be 4 */
-
-            /* last step adds the carry */
-            r = (int64_t)y[0].d[p];
-            for(k = 0; k < l; k++) {
-                t = (dlimb_t)u[k] * mods[j] + r + carry[k];
-                r = t >> LIMB_BITS;
-                u[k] = t;
-            }
-            u[l] = r + carry[l];
-
-            /* write the digits */
-            pos = i * dpl;
-            for(j = 0; j < n_limb1; j++) {
-                put_bits(tabr, r_len, pos, u[j]);
-                pos += LIMB_BITS;
-            }
-            put_bits(tabr, r_len, pos, u[n_limb1] & base_mask1);
-            /* shift by dpl digits and set the carry */
-            if (shift == 0) {
-                for(j = n_limb1 + 1; j < nb_mods; j++)
-                    carry[j - (n_limb1 + 1)] = u[j];
-            } else {
-                for(j = n_limb1; j < nb_mods - 1; j++) {
-                    carry[j - n_limb1] = (u[j] >> shift) |
-                        (u[j + 1] << (LIMB_BITS - shift));
-                }
-                carry[nb_mods - 1 - n_limb1] = u[nb_mods - 1] >> shift;
-            }
-            i++;
-        }
-    }
-}
-#else
-static no_inline void ntt_to_limb(BFNTTState *s, limb_t *tabr, limb_t r_len,
-                                  const NTTLimb *buf, int fft_len_log2, int dpl,
-                                  int nb_mods)
-{
-    const limb_t *mods = ntt_mods + NB_MODS - nb_mods;
-    const limb_t *mods_cr, *mods_cr_inv;
-    limb_t y[NB_MODS], u[NB_MODS], carry[NB_MODS], fft_len, base_mask1, r;
-    slimb_t i, len, pos;
-    int j, k, l, shift, n_limb1;
-    dlimb_t t;
-
-    j = NB_MODS * (NB_MODS - 1) / 2 - nb_mods * (nb_mods - 1) / 2;
-    mods_cr = ntt_mods_cr + j;
-    mods_cr_inv = s->ntt_mods_cr_inv + j;
-
-    shift = dpl & (LIMB_BITS - 1);
-    if (shift == 0)
-        base_mask1 = -1;
-    else
-        base_mask1 = ((limb_t)1 << shift) - 1;
-    n_limb1 = ((unsigned)dpl - 1) / LIMB_BITS;
-    for(j = 0; j < NB_MODS; j++)
-        carry[j] = 0;
-    for(j = 0; j < NB_MODS; j++)
-        u[j] = 0; /* avoid warnings */
-    memset(tabr, 0, sizeof(limb_t) * r_len);
-    fft_len = (limb_t)1 << fft_len_log2;
-    len = bf_min(fft_len, (r_len * LIMB_BITS + dpl - 1) / dpl);
-    for(i = 0; i < len; i++) {
-        for(j = 0; j < nb_mods; j++)  {
-            y[j] = ntt_limb_to_int(buf[i + fft_len * j], mods[j]);
-        }
-
-        /* Chinese remainder to get mixed radix representation */
-        l = 0;
-        for(j = 0; j < nb_mods - 1; j++) {
-            for(k = j + 1; k < nb_mods; k++) {
-                limb_t m;
-                m = mods[k];
-                /* Note: there is no overflow in the sub_mod() because
-                   the modulos are sorted by increasing order */
-                y[k] = mul_mod_fast2(y[k] - y[j] + m,
-                                     mods_cr[l], m, mods_cr_inv[l]);
-                l++;
-            }
-        }
-
-        /* back to normal representation */
-        u[0] = y[nb_mods - 1];
-        l = 1;
-        for(j = nb_mods - 2; j >= 1; j--) {
-            r = y[j];
-            for(k = 0; k < l; k++) {
-                t = (dlimb_t)u[k] * mods[j] + r;
-                r = t >> LIMB_BITS;
-                u[k] = t;
-            }
-            u[l] = r;
-            l++;
-        }
-
-        /* last step adds the carry */
-        r = y[0];
-        for(k = 0; k < l; k++) {
-            t = (dlimb_t)u[k] * mods[j] + r + carry[k];
-            r = t >> LIMB_BITS;
-            u[k] = t;
-        }
-        u[l] = r + carry[l];
-
-        /* write the digits */
-        pos = i * dpl;
-        for(j = 0; j < n_limb1; j++) {
-            put_bits(tabr, r_len, pos, u[j]);
-            pos += LIMB_BITS;
-        }
-        put_bits(tabr, r_len, pos, u[n_limb1] & base_mask1);
-        /* shift by dpl digits and set the carry */
-        if (shift == 0) {
-            for(j = n_limb1 + 1; j < nb_mods; j++)
-                carry[j - (n_limb1 + 1)] = u[j];
-        } else {
-            for(j = n_limb1; j < nb_mods - 1; j++) {
-                carry[j - n_limb1] = (u[j] >> shift) |
-                    (u[j + 1] << (LIMB_BITS - shift));
-            }
-            carry[nb_mods - 1 - n_limb1] = u[nb_mods - 1] >> shift;
-        }
-    }
-}
-#endif
-
-static int ntt_static_init(bf_context_t *s1)
-{
-    BFNTTState *s;
-    int inverse, i, j, k, l;
-    limb_t c, c_inv, c_inv2, m, m_inv;
-
-    if (s1->ntt_state)
-        return 0;
-#if defined(__AVX2__)
-    s = bf_aligned_malloc(s1, sizeof(*s), 64);
-#else
-    s = bf_malloc(s1, sizeof(*s));
-#endif
-    if (!s)
-        return -1;
-    memset(s, 0, sizeof(*s));
-    s1->ntt_state = s;
-    s->ctx = s1;
-
-    for(j = 0; j < NB_MODS; j++) {
-        m = ntt_mods[j];
-        m_inv = init_mul_mod_fast(m);
-        s->ntt_mods_div[j] = m_inv;
-#if defined(__AVX2__)
-        s->ntt_mods_vec[j] = _mm256_set1_pd(m);
-        s->ntt_mods_inv_vec[j] = _mm256_set1_pd(1.0 / (double)m);
-#endif
-        c_inv2 = (m + 1) / 2; /* 1/2 */
-        c_inv = 1;
-        for(i = 0; i <= NTT_PROOT_2EXP; i++) {
-            s->ntt_len_inv[j][i][0] = c_inv;
-            s->ntt_len_inv[j][i][1] = init_mul_mod_fast2(c_inv, m);
-            c_inv = mul_mod_fast(c_inv, c_inv2, m, m_inv);
-        }
-
-        for(inverse = 0; inverse < 2; inverse++) {
-            c = ntt_proot[inverse][j];
-            for(i = 0; i < NTT_PROOT_2EXP; i++) {
-                s->ntt_proot_pow[j][inverse][NTT_PROOT_2EXP - i] = c;
-                s->ntt_proot_pow_inv[j][inverse][NTT_PROOT_2EXP - i] =
-                    init_mul_mod_fast2(c, m);
-                c = mul_mod_fast(c, c, m, m_inv);
-            }
-        }
-    }
-
-    l = 0;
-    for(j = 0; j < NB_MODS - 1; j++) {
-        for(k = j + 1; k < NB_MODS; k++) {
-#if defined(__AVX2__)
-            s->ntt_mods_cr_vec[l] = _mm256_set1_pd(int_to_ntt_limb2(ntt_mods_cr[l],
-                                                                    ntt_mods[k]));
-#else
-            s->ntt_mods_cr_inv[l] = init_mul_mod_fast2(ntt_mods_cr[l],
-                                                       ntt_mods[k]);
-#endif
-            l++;
-        }
-    }
-    return 0;
-}
-
-int bf_get_fft_size(int *pdpl, int *pnb_mods, limb_t len)
-{
-    int dpl, fft_len_log2, n_bits, nb_mods, dpl_found, fft_len_log2_found;
-    int int_bits, nb_mods_found;
-    limb_t cost, min_cost;
-
-    min_cost = -1;
-    dpl_found = 0;
-    nb_mods_found = 4;
-    fft_len_log2_found = 0;
-    for(nb_mods = 3; nb_mods <= NB_MODS; nb_mods++) {
-        int_bits = ntt_int_bits[NB_MODS - nb_mods];
-        dpl = bf_min((int_bits - 4) / 2,
-                     2 * LIMB_BITS + 2 * NTT_MOD_LOG2_MIN - NTT_MOD_LOG2_MAX);
-        for(;;) {
-            fft_len_log2 = ceil_log2((len * LIMB_BITS + dpl - 1) / dpl);
-            if (fft_len_log2 > NTT_PROOT_2EXP)
-                goto next;
-            n_bits = fft_len_log2 + 2 * dpl;
-            if (n_bits <= int_bits) {
-                cost = ((limb_t)(fft_len_log2 + 1) << fft_len_log2) * nb_mods;
-                //                printf("n=%d dpl=%d: cost=%" PRId64 "\n", nb_mods, dpl, (int64_t)cost);
-                if (cost < min_cost) {
-                    min_cost = cost;
-                    dpl_found = dpl;
-                    nb_mods_found = nb_mods;
-                    fft_len_log2_found = fft_len_log2;
-                }
-                break;
-            }
-            dpl--;
-            if (dpl == 0)
-                break;
-        }
-    next: ;
-    }
-    if (!dpl_found)
-        abort();
-    /* limit dpl if possible to reduce fixed cost of limb/NTT conversion */
-    if (dpl_found > (LIMB_BITS + NTT_MOD_LOG2_MIN) &&
-        ((limb_t)(LIMB_BITS + NTT_MOD_LOG2_MIN) << fft_len_log2_found) >=
-        len * LIMB_BITS) {
-        dpl_found = LIMB_BITS + NTT_MOD_LOG2_MIN;
-    }
-    *pnb_mods = nb_mods_found;
-    *pdpl = dpl_found;
-    return fft_len_log2_found;
-}
-
-/* return 0 if OK, -1 if memory error */
-static no_inline int fft_mul(bf_context_t *s1,
-                             bf_t *res, limb_t *a_tab, limb_t a_len,
-                             limb_t *b_tab, limb_t b_len, int mul_flags)
-{
-    BFNTTState *s;
-    int dpl, fft_len_log2, j, nb_mods, reduced_mem;
-    slimb_t len, fft_len;
-    NTTLimb *buf1, *buf2, *ptr;
-#if defined(USE_MUL_CHECK)
-    limb_t ha, hb, hr, h_ref;
-#endif
-
-    if (ntt_static_init(s1))
-        return -1;
-    s = s1->ntt_state;
-
-    /* find the optimal number of digits per limb (dpl) */
-    len = a_len + b_len;
-    fft_len_log2 = bf_get_fft_size(&dpl, &nb_mods, len);
-    fft_len = (uint64_t)1 << fft_len_log2;
-    //    printf("len=%" PRId64 " fft_len_log2=%d dpl=%d\n", len, fft_len_log2, dpl);
-#if defined(USE_MUL_CHECK)
-    ha = mp_mod1(a_tab, a_len, BF_CHKSUM_MOD, 0);
-    hb = mp_mod1(b_tab, b_len, BF_CHKSUM_MOD, 0);
-#endif
-    if ((mul_flags & (FFT_MUL_R_OVERLAP_A | FFT_MUL_R_OVERLAP_B)) == 0) {
-        if (!(mul_flags & FFT_MUL_R_NORESIZE))
-            bf_resize(res, 0);
-    } else if (mul_flags & FFT_MUL_R_OVERLAP_B) {
-        limb_t *tmp_tab, tmp_len;
-        /* it is better to free 'b' first */
-        tmp_tab = a_tab;
-        a_tab = b_tab;
-        b_tab = tmp_tab;
-        tmp_len = a_len;
-        a_len = b_len;
-        b_len = tmp_len;
-    }
-    buf1 = ntt_malloc(s, sizeof(NTTLimb) * fft_len * nb_mods);
-    if (!buf1)
-        return -1;
-    limb_to_ntt(s, buf1, fft_len, a_tab, a_len, dpl,
-                NB_MODS - nb_mods, nb_mods);
-    if ((mul_flags & (FFT_MUL_R_OVERLAP_A | FFT_MUL_R_OVERLAP_B)) ==
-        FFT_MUL_R_OVERLAP_A) {
-        if (!(mul_flags & FFT_MUL_R_NORESIZE))
-            bf_resize(res, 0);
-    }
-    reduced_mem = (fft_len_log2 >= 14);
-    if (!reduced_mem) {
-        buf2 = ntt_malloc(s, sizeof(NTTLimb) * fft_len * nb_mods);
-        if (!buf2)
-            goto fail;
-        limb_to_ntt(s, buf2, fft_len, b_tab, b_len, dpl,
-                    NB_MODS - nb_mods, nb_mods);
-        if (!(mul_flags & FFT_MUL_R_NORESIZE))
-            bf_resize(res, 0); /* in case res == b */
-    } else {
-        buf2 = ntt_malloc(s, sizeof(NTTLimb) * fft_len);
-        if (!buf2)
-            goto fail;
-    }
-    for(j = 0; j < nb_mods; j++) {
-        if (reduced_mem) {
-            limb_to_ntt(s, buf2, fft_len, b_tab, b_len, dpl,
-                        NB_MODS - nb_mods + j, 1);
-            ptr = buf2;
-        } else {
-            ptr = buf2 + fft_len * j;
-        }
-        if (ntt_conv(s, buf1 + fft_len * j, ptr,
-                     fft_len_log2, fft_len_log2, j + NB_MODS - nb_mods))
-            goto fail;
-    }
-    if (!(mul_flags & FFT_MUL_R_NORESIZE))
-        bf_resize(res, 0); /* in case res == b and reduced mem */
-    ntt_free(s, buf2);
-    buf2 = NULL;
-    if (!(mul_flags & FFT_MUL_R_NORESIZE)) {
-        if (bf_resize(res, len))
-            goto fail;
-    }
-    ntt_to_limb(s, res->tab, len, buf1, fft_len_log2, dpl, nb_mods);
-    ntt_free(s, buf1);
-#if defined(USE_MUL_CHECK)
-    hr = mp_mod1(res->tab, len, BF_CHKSUM_MOD, 0);
-    h_ref = mul_mod(ha, hb, BF_CHKSUM_MOD);
-    if (hr != h_ref) {
-        printf("ntt_mul_error: len=%" PRId_LIMB " fft_len_log2=%d dpl=%d nb_mods=%d\n",
-               len, fft_len_log2, dpl, nb_mods);
-        //        printf("ha=0x" FMT_LIMB" hb=0x" FMT_LIMB " hr=0x" FMT_LIMB " expected=0x" FMT_LIMB "\n", ha, hb, hr, h_ref);
-        exit(1);
-    }
-#endif
-    return 0;
- fail:
-    ntt_free(s, buf1);
-    ntt_free(s, buf2);
-    return -1;
-}
-
-#else /* USE_FFT_MUL */
-
-int bf_get_fft_size(int *pdpl, int *pnb_mods, limb_t len)
-{
-    return 0;
-}
-
-#endif /* !USE_FFT_MUL */
-
-#undef malloc
-#undef free
-#undef realloc
diff --git a/lib/monoucha0/monoucha/qjs/libbf.h b/lib/monoucha0/monoucha/qjs/libbf.h
deleted file mode 100644
index 3586532e..00000000
--- a/lib/monoucha0/monoucha/qjs/libbf.h
+++ /dev/null
@@ -1,545 +0,0 @@
-/*
- * Tiny arbitrary precision floating point library
- *
- * Copyright (c) 2017-2021 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef LIBBF_H
-#define LIBBF_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if INTPTR_MAX >= INT64_MAX && !defined(_WIN32) && !defined(__TINYC__)
-#define LIMB_LOG2_BITS 6
-#else
-#define LIMB_LOG2_BITS 5
-#endif
-
-#define LIMB_BITS (1 << LIMB_LOG2_BITS)
-
-#if LIMB_BITS == 64
-#ifndef INT128_MAX
-__extension__ typedef __int128 int128_t;
-__extension__ typedef unsigned __int128 uint128_t;
-#endif
-typedef int64_t slimb_t;
-typedef uint64_t limb_t;
-typedef uint128_t dlimb_t;
-#define BF_RAW_EXP_MIN INT64_MIN
-#define BF_RAW_EXP_MAX INT64_MAX
-
-#define LIMB_DIGITS 19
-#define BF_DEC_BASE UINT64_C(10000000000000000000)
-
-#else
-
-typedef int32_t slimb_t;
-typedef uint32_t limb_t;
-typedef uint64_t dlimb_t;
-#define BF_RAW_EXP_MIN INT32_MIN
-#define BF_RAW_EXP_MAX INT32_MAX
-
-#define LIMB_DIGITS 9
-#define BF_DEC_BASE 1000000000U
-
-#endif
-
-/* in bits */
-/* minimum number of bits for the exponent */
-#define BF_EXP_BITS_MIN 3
-/* maximum number of bits for the exponent */
-#define BF_EXP_BITS_MAX (LIMB_BITS - 3)
-/* extended range for exponent, used internally */
-#define BF_EXT_EXP_BITS_MAX (BF_EXP_BITS_MAX + 1)
-/* minimum possible precision */
-#define BF_PREC_MIN 2
-/* minimum possible precision */
-#define BF_PREC_MAX (((limb_t)1 << (LIMB_BITS - 2)) - 2)
-/* some operations support infinite precision */
-#define BF_PREC_INF (BF_PREC_MAX + 1) /* infinite precision */
-
-#if LIMB_BITS == 64
-#define BF_CHKSUM_MOD (UINT64_C(975620677) * UINT64_C(9795002197))
-#else
-#define BF_CHKSUM_MOD 975620677U
-#endif
-
-#define BF_EXP_ZERO BF_RAW_EXP_MIN
-#define BF_EXP_INF (BF_RAW_EXP_MAX - 1)
-#define BF_EXP_NAN BF_RAW_EXP_MAX
-
-/* +/-zero is represented with expn = BF_EXP_ZERO and len = 0,
-   +/-infinity is represented with expn = BF_EXP_INF and len = 0,
-   NaN is represented with expn = BF_EXP_NAN and len = 0 (sign is ignored)
- */
-typedef struct {
-    struct bf_context_t *ctx;
-    int sign;
-    slimb_t expn;
-    limb_t len;
-    limb_t *tab;
-} bf_t;
-
-typedef struct {
-    /* must be kept identical to bf_t */
-    struct bf_context_t *ctx;
-    int sign;
-    slimb_t expn;
-    limb_t len;
-    limb_t *tab;
-} bfdec_t;
-
-typedef enum {
-    BF_RNDN, /* round to nearest, ties to even */
-    BF_RNDZ, /* round to zero */
-    BF_RNDD, /* round to -inf (the code relies on (BF_RNDD xor BF_RNDU) = 1) */
-    BF_RNDU, /* round to +inf */
-    BF_RNDNA, /* round to nearest, ties away from zero */
-    BF_RNDA, /* round away from zero */
-    BF_RNDF, /* faithful rounding (nondeterministic, either RNDD or RNDU,
-                inexact flag is always set)  */
-} bf_rnd_t;
-
-/* allow subnormal numbers. Only available if the number of exponent
-   bits is <= BF_EXP_BITS_USER_MAX and prec != BF_PREC_INF. */
-#define BF_FLAG_SUBNORMAL (1 << 3)
-/* 'prec' is the precision after the radix point instead of the whole
-   mantissa. Can only be used with bf_round() and
-   bfdec_[add|sub|mul|div|sqrt|round](). */
-#define BF_FLAG_RADPNT_PREC (1 << 4)
-
-#define BF_RND_MASK 0x7
-#define BF_EXP_BITS_SHIFT 5
-#define BF_EXP_BITS_MASK 0x3f
-
-/* shortcut for bf_set_exp_bits(BF_EXT_EXP_BITS_MAX) */
-#define BF_FLAG_EXT_EXP (BF_EXP_BITS_MASK << BF_EXP_BITS_SHIFT)
-
-/* contains the rounding mode and number of exponents bits */
-typedef uint32_t bf_flags_t;
-
-typedef void *bf_realloc_func_t(void *opaque, void *ptr, size_t size);
-
-typedef struct {
-    bf_t val;
-    limb_t prec;
-} BFConstCache;
-
-typedef struct bf_context_t {
-    void *realloc_opaque;
-    bf_realloc_func_t *realloc_func;
-    BFConstCache log2_cache;
-    BFConstCache pi_cache;
-    struct BFNTTState *ntt_state;
-} bf_context_t;
-
-static inline int bf_get_exp_bits(bf_flags_t flags)
-{
-    int e;
-    e = (flags >> BF_EXP_BITS_SHIFT) & BF_EXP_BITS_MASK;
-    if (e == BF_EXP_BITS_MASK)
-        return BF_EXP_BITS_MAX + 1;
-    else
-        return BF_EXP_BITS_MAX - e;
-}
-
-static inline bf_flags_t bf_set_exp_bits(int n)
-{
-    return ((BF_EXP_BITS_MAX - n) & BF_EXP_BITS_MASK) << BF_EXP_BITS_SHIFT;
-}
-
-/* returned status */
-#define BF_ST_INVALID_OP  (1 << 0)
-#define BF_ST_DIVIDE_ZERO (1 << 1)
-#define BF_ST_OVERFLOW    (1 << 2)
-#define BF_ST_UNDERFLOW   (1 << 3)
-#define BF_ST_INEXACT     (1 << 4)
-/* indicate that a memory allocation error occured. NaN is returned */
-#define BF_ST_MEM_ERROR   (1 << 5)
-
-#define BF_RADIX_MAX 36 /* maximum radix for bf_atof() and bf_ftoa() */
-
-static inline slimb_t bf_max(slimb_t a, slimb_t b)
-{
-    if (a > b)
-        return a;
-    else
-        return b;
-}
-
-static inline slimb_t bf_min(slimb_t a, slimb_t b)
-{
-    if (a < b)
-        return a;
-    else
-        return b;
-}
-
-void bf_context_init(bf_context_t *s, bf_realloc_func_t *realloc_func,
-                     void *realloc_opaque);
-void bf_context_end(bf_context_t *s);
-/* free memory allocated for the bf cache data */
-void bf_clear_cache(bf_context_t *s);
-
-static inline void *bf_realloc(bf_context_t *s, void *ptr, size_t size)
-{
-    return s->realloc_func(s->realloc_opaque, ptr, size);
-}
-
-/* 'size' must be != 0 */
-static inline void *bf_malloc(bf_context_t *s, size_t size)
-{
-    return bf_realloc(s, NULL, size);
-}
-
-static inline void bf_free(bf_context_t *s, void *ptr)
-{
-    /* must test ptr otherwise equivalent to malloc(0) */
-    if (ptr)
-        bf_realloc(s, ptr, 0);
-}
-
-void bf_init(bf_context_t *s, bf_t *r);
-
-static inline void bf_delete(bf_t *r)
-{
-    bf_context_t *s = r->ctx;
-    /* we accept to delete a zeroed bf_t structure */
-    if (s && r->tab) {
-        bf_realloc(s, r->tab, 0);
-    }
-}
-
-static inline void bf_neg(bf_t *r)
-{
-    r->sign ^= 1;
-}
-
-static inline int bf_is_finite(const bf_t *a)
-{
-    return (a->expn < BF_EXP_INF);
-}
-
-static inline int bf_is_nan(const bf_t *a)
-{
-    return (a->expn == BF_EXP_NAN);
-}
-
-static inline int bf_is_zero(const bf_t *a)
-{
-    return (a->expn == BF_EXP_ZERO);
-}
-
-static inline void bf_memcpy(bf_t *r, const bf_t *a)
-{
-    *r = *a;
-}
-
-int bf_set_ui(bf_t *r, uint64_t a);
-int bf_set_si(bf_t *r, int64_t a);
-void bf_set_nan(bf_t *r);
-void bf_set_zero(bf_t *r, int is_neg);
-void bf_set_inf(bf_t *r, int is_neg);
-int bf_set(bf_t *r, const bf_t *a);
-void bf_move(bf_t *r, bf_t *a);
-int bf_get_float64(const bf_t *a, double *pres, bf_rnd_t rnd_mode);
-int bf_set_float64(bf_t *a, double d);
-
-int bf_cmpu(const bf_t *a, const bf_t *b);
-int bf_cmp_full(const bf_t *a, const bf_t *b);
-int bf_cmp(const bf_t *a, const bf_t *b);
-static inline int bf_cmp_eq(const bf_t *a, const bf_t *b)
-{
-    return bf_cmp(a, b) == 0;
-}
-
-static inline int bf_cmp_le(const bf_t *a, const bf_t *b)
-{
-    return bf_cmp(a, b) <= 0;
-}
-
-static inline int bf_cmp_lt(const bf_t *a, const bf_t *b)
-{
-    return bf_cmp(a, b) < 0;
-}
-
-int bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, bf_flags_t flags);
-int bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, bf_flags_t flags);
-int bf_add_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec, bf_flags_t flags);
-int bf_mul(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, bf_flags_t flags);
-int bf_mul_ui(bf_t *r, const bf_t *a, uint64_t b1, limb_t prec, bf_flags_t flags);
-int bf_mul_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec,
-              bf_flags_t flags);
-int bf_mul_2exp(bf_t *r, slimb_t e, limb_t prec, bf_flags_t flags);
-int bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, bf_flags_t flags);
-#define BF_DIVREM_EUCLIDIAN BF_RNDF
-int bf_divrem(bf_t *q, bf_t *r, const bf_t *a, const bf_t *b,
-              limb_t prec, bf_flags_t flags, int rnd_mode);
-int bf_rem(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-           bf_flags_t flags, int rnd_mode);
-int bf_remquo(slimb_t *pq, bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
-              bf_flags_t flags, int rnd_mode);
-/* round to integer with infinite precision */
-int bf_rint(bf_t *r, int rnd_mode);
-int bf_round(bf_t *r, limb_t prec, bf_flags_t flags);
-int bf_sqrtrem(bf_t *r, bf_t *rem1, const bf_t *a);
-int bf_sqrt(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags);
-slimb_t bf_get_exp_min(const bf_t *a);
-int bf_logic_or(bf_t *r, const bf_t *a, const bf_t *b);
-int bf_logic_xor(bf_t *r, const bf_t *a, const bf_t *b);
-int bf_logic_and(bf_t *r, const bf_t *a, const bf_t *b);
-
-/* additional flags for bf_atof */
-/* do not accept hex radix prefix (0x or 0X) if radix = 0 or radix = 16 */
-#define BF_ATOF_NO_HEX       (1 << 16)
-/* accept binary (0b or 0B) or octal (0o or 0O) radix prefix if radix = 0 */
-#define BF_ATOF_BIN_OCT      (1 << 17)
-/* Do not parse NaN or Inf */
-#define BF_ATOF_NO_NAN_INF   (1 << 18)
-/* return the exponent separately */
-#define BF_ATOF_EXPONENT       (1 << 19)
-
-int bf_atof(bf_t *a, const char *str, const char **pnext, int radix,
-            limb_t prec, bf_flags_t flags);
-/* this version accepts prec = BF_PREC_INF and returns the radix
-   exponent */
-int bf_atof2(bf_t *r, slimb_t *pexponent,
-             const char *str, const char **pnext, int radix,
-             limb_t prec, bf_flags_t flags);
-int bf_mul_pow_radix(bf_t *r, const bf_t *T, limb_t radix,
-                     slimb_t expn, limb_t prec, bf_flags_t flags);
-
-
-/* Conversion of floating point number to string. Return a null
-   terminated string or NULL if memory error. *plen contains its
-   length if plen != NULL.  The exponent letter is "e" for base 10,
-   "p" for bases 2, 8, 16 with a binary exponent and "@" for the other
-   bases. */
-
-#define BF_FTOA_FORMAT_MASK (3 << 16)
-
-/* fixed format: prec significant digits rounded with (flags &
-   BF_RND_MASK). Exponential notation is used if too many zeros are
-   needed.*/
-#define BF_FTOA_FORMAT_FIXED (0 << 16)
-/* fractional format: prec digits after the decimal point rounded with
-   (flags & BF_RND_MASK) */
-#define BF_FTOA_FORMAT_FRAC  (1 << 16)
-/* free format:
-
-   For binary radices with bf_ftoa() and for bfdec_ftoa(): use the minimum
-   number of digits to represent 'a'. The precision and the rounding
-   mode are ignored.
-
-   For the non binary radices with bf_ftoa(): use as many digits as
-   necessary so that bf_atof() return the same number when using
-   precision 'prec', rounding to nearest and the subnormal
-   configuration of 'flags'. The result is meaningful only if 'a' is
-   already rounded to 'prec' bits. If the subnormal flag is set, the
-   exponent in 'flags' must also be set to the desired exponent range.
-*/
-#define BF_FTOA_FORMAT_FREE  (2 << 16)
-/* same as BF_FTOA_FORMAT_FREE but uses the minimum number of digits
-   (takes more computation time). Identical to BF_FTOA_FORMAT_FREE for
-   binary radices with bf_ftoa() and for bfdec_ftoa(). */
-#define BF_FTOA_FORMAT_FREE_MIN (3 << 16)
-
-/* force exponential notation for fixed or free format */
-#define BF_FTOA_FORCE_EXP    (1 << 20)
-/* add 0x prefix for base 16, 0o prefix for base 8 or 0b prefix for
-   base 2 if non zero value */
-#define BF_FTOA_ADD_PREFIX   (1 << 21)
-/* return "Infinity" instead of "Inf" and add a "+" for positive
-   exponents */
-#define BF_FTOA_JS_QUIRKS    (1 << 22)
-
-char *bf_ftoa(size_t *plen, const bf_t *a, int radix, limb_t prec,
-              bf_flags_t flags);
-
-/* modulo 2^n instead of saturation. NaN and infinity return 0 */
-#define BF_GET_INT_MOD (1 << 0)
-int bf_get_int32(int *pres, const bf_t *a, int flags);
-int bf_get_int64(int64_t *pres, const bf_t *a, int flags);
-int bf_get_uint64(uint64_t *pres, const bf_t *a);
-
-/* the following functions are exported for testing only. */
-void mp_print_str(const char *str, const limb_t *tab, limb_t n);
-void bf_print_str(const char *str, const bf_t *a);
-int bf_resize(bf_t *r, limb_t len);
-int bf_get_fft_size(int *pdpl, int *pnb_mods, limb_t len);
-int bf_normalize_and_round(bf_t *r, limb_t prec1, bf_flags_t flags);
-int bf_can_round(const bf_t *a, slimb_t prec, bf_rnd_t rnd_mode, slimb_t k);
-slimb_t bf_mul_log2_radix(slimb_t a1, unsigned int radix, int is_inv,
-                          int is_ceil1);
-int mp_mul(bf_context_t *s, limb_t *result,
-           const limb_t *op1, limb_t op1_size,
-           const limb_t *op2, limb_t op2_size);
-limb_t mp_add(limb_t *res, const limb_t *op1, const limb_t *op2,
-              limb_t n, limb_t carry);
-limb_t mp_add_ui(limb_t *tab, limb_t b, size_t n);
-int mp_sqrtrem(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n);
-int mp_recip(bf_context_t *s, limb_t *tabr, const limb_t *taba, limb_t n);
-limb_t bf_isqrt(limb_t a);
-
-/* transcendental functions */
-int bf_const_log2(bf_t *T, limb_t prec, bf_flags_t flags);
-int bf_const_pi(bf_t *T, limb_t prec, bf_flags_t flags);
-int bf_exp(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags);
-int bf_log(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags);
-#define BF_POW_JS_QUIRKS (1 << 16) /* (+/-1)^(+/-Inf) = NaN, 1^NaN = NaN */
-int bf_pow(bf_t *r, const bf_t *x, const bf_t *y, limb_t prec, bf_flags_t flags);
-int bf_cos(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags);
-int bf_sin(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags);
-int bf_tan(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags);
-int bf_atan(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags);
-int bf_atan2(bf_t *r, const bf_t *y, const bf_t *x,
-             limb_t prec, bf_flags_t flags);
-int bf_asin(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags);
-int bf_acos(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags);
-
-/* decimal floating point */
-
-static inline void bfdec_init(bf_context_t *s, bfdec_t *r)
-{
-    bf_init(s, (bf_t *)r);
-}
-static inline void bfdec_delete(bfdec_t *r)
-{
-    bf_delete((bf_t *)r);
-}
-
-static inline void bfdec_neg(bfdec_t *r)
-{
-    r->sign ^= 1;
-}
-
-static inline int bfdec_is_finite(const bfdec_t *a)
-{
-    return (a->expn < BF_EXP_INF);
-}
-
-static inline int bfdec_is_nan(const bfdec_t *a)
-{
-    return (a->expn == BF_EXP_NAN);
-}
-
-static inline int bfdec_is_zero(const bfdec_t *a)
-{
-    return (a->expn == BF_EXP_ZERO);
-}
-
-static inline void bfdec_memcpy(bfdec_t *r, const bfdec_t *a)
-{
-    bf_memcpy((bf_t *)r, (const bf_t *)a);
-}
-
-int bfdec_set_ui(bfdec_t *r, uint64_t a);
-int bfdec_set_si(bfdec_t *r, int64_t a);
-
-static inline void bfdec_set_nan(bfdec_t *r)
-{
-    bf_set_nan((bf_t *)r);
-}
-static inline void bfdec_set_zero(bfdec_t *r, int is_neg)
-{
-    bf_set_zero((bf_t *)r, is_neg);
-}
-static inline void bfdec_set_inf(bfdec_t *r, int is_neg)
-{
-    bf_set_inf((bf_t *)r, is_neg);
-}
-static inline int bfdec_set(bfdec_t *r, const bfdec_t *a)
-{
-    return bf_set((bf_t *)r, (bf_t *)a);
-}
-static inline void bfdec_move(bfdec_t *r, bfdec_t *a)
-{
-    bf_move((bf_t *)r, (bf_t *)a);
-}
-static inline int bfdec_cmpu(const bfdec_t *a, const bfdec_t *b)
-{
-    return bf_cmpu((const bf_t *)a, (const bf_t *)b);
-}
-static inline int bfdec_cmp_full(const bfdec_t *a, const bfdec_t *b)
-{
-    return bf_cmp_full((const bf_t *)a, (const bf_t *)b);
-}
-static inline int bfdec_cmp(const bfdec_t *a, const bfdec_t *b)
-{
-    return bf_cmp((const bf_t *)a, (const bf_t *)b);
-}
-static inline int bfdec_cmp_eq(const bfdec_t *a, const bfdec_t *b)
-{
-    return bfdec_cmp(a, b) == 0;
-}
-static inline int bfdec_cmp_le(const bfdec_t *a, const bfdec_t *b)
-{
-    return bfdec_cmp(a, b) <= 0;
-}
-static inline int bfdec_cmp_lt(const bfdec_t *a, const bfdec_t *b)
-{
-    return bfdec_cmp(a, b) < 0;
-}
-
-int bfdec_add(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-              bf_flags_t flags);
-int bfdec_sub(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-              bf_flags_t flags);
-int bfdec_add_si(bfdec_t *r, const bfdec_t *a, int64_t b1, limb_t prec,
-                 bf_flags_t flags);
-int bfdec_mul(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-              bf_flags_t flags);
-int bfdec_mul_si(bfdec_t *r, const bfdec_t *a, int64_t b1, limb_t prec,
-                 bf_flags_t flags);
-int bfdec_div(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-              bf_flags_t flags);
-int bfdec_divrem(bfdec_t *q, bfdec_t *r, const bfdec_t *a, const bfdec_t *b,
-                 limb_t prec, bf_flags_t flags, int rnd_mode);
-int bfdec_rem(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec,
-              bf_flags_t flags, int rnd_mode);
-int bfdec_rint(bfdec_t *r, int rnd_mode);
-int bfdec_sqrt(bfdec_t *r, const bfdec_t *a, limb_t prec, bf_flags_t flags);
-int bfdec_round(bfdec_t *r, limb_t prec, bf_flags_t flags);
-int bfdec_get_int32(int *pres, const bfdec_t *a);
-int bfdec_pow_ui(bfdec_t *r, const bfdec_t *a, limb_t b);
-
-char *bfdec_ftoa(size_t *plen, const bfdec_t *a, limb_t prec, bf_flags_t flags);
-int bfdec_atof(bfdec_t *r, const char *str, const char **pnext,
-               limb_t prec, bf_flags_t flags);
-
-/* the following functions are exported for testing only. */
-extern const limb_t mp_pow_dec[LIMB_DIGITS + 1];
-void bfdec_print_str(const char *str, const bfdec_t *a);
-static inline int bfdec_resize(bfdec_t *r, limb_t len)
-{
-    return bf_resize((bf_t *)r, len);
-}
-int bfdec_normalize_and_round(bfdec_t *r, limb_t prec1, bf_flags_t flags);
-
-#ifdef __cplusplus
-} /* extern "C" { */
-#endif
-
-#endif /* LIBBF_H */
diff --git a/lib/monoucha0/monoucha/qjs/libregexp.c b/lib/monoucha0/monoucha/qjs/libregexp.c
index 693acbbf..da79dedb 100644
--- a/lib/monoucha0/monoucha/qjs/libregexp.c
+++ b/lib/monoucha0/monoucha/qjs/libregexp.c
@@ -53,6 +53,9 @@ typedef enum {
 
 #define CAPTURE_COUNT_MAX 255
 #define STACK_SIZE_MAX 255
+/* must be large enough to have a negligible runtime cost and small
+   enough to call the interrupt callback often. */
+#define INTERRUPT_COUNTER_INIT 10000
 
 /* unicode code points */
 #define CP_LS   0x2028
@@ -2012,6 +2015,7 @@ typedef struct {
     bool multi_line;
     bool ignore_case;
     bool is_unicode;
+    int interrupt_counter;
     void *opaque; /* used for stack overflow check */
 
     size_t state_size;
@@ -2058,7 +2062,17 @@ static int push_state(REExecContext *s,
     return 0;
 }
 
-/* return 1 if match, 0 if not match or -1 if error. */
+static int lre_poll_timeout(REExecContext *s)
+{
+    if (unlikely(--s->interrupt_counter <= 0)) {
+        s->interrupt_counter = INTERRUPT_COUNTER_INIT;
+        if (lre_check_timeout(s->opaque))
+            return LRE_RET_TIMEOUT;
+    }
+    return 0;
+}
+
+/* return 1 if match, 0 if not match or < 0 if error. */
 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                                    StackInt *stack, int stack_len,
                                    const uint8_t *pc, const uint8_t *cptr,
@@ -2089,6 +2103,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 ret = 0;
             recurse:
                 for(;;) {
+                    if (lre_poll_timeout(s))
+                        return LRE_RET_TIMEOUT;
                     if (s->state_stack_len == 0)
                         return ret;
                     rs = (REExecState *)(s->state_stack +
@@ -2182,7 +2198,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 ret = push_state(s, capture, stack, stack_len,
                                  pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
                 if (ret < 0)
-                    return -1;
+                    return LRE_RET_MEMORY_ERROR;
                 break;
             }
         case REOP_lookahead:
@@ -2194,12 +2210,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                              RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
                              0);
             if (ret < 0)
-                return -1;
+                return LRE_RET_MEMORY_ERROR;
             break;
 
         case REOP_goto:
             val = get_u32(pc);
             pc += 4 + (int)val;
+            if (lre_poll_timeout(s))
+                return LRE_RET_TIMEOUT;
             break;
         case REOP_line_start:
             if (cptr == s->cbuf)
@@ -2264,6 +2282,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
             pc += 4;
             if (--stack[stack_len - 1] != 0) {
                 pc += (int)val;
+                if (lre_poll_timeout(s))
+                    return LRE_RET_TIMEOUT;
             }
             break;
         case REOP_push_char_pos:
@@ -2438,9 +2458,12 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
 
                 q = 0;
                 for(;;) {
+                    if (lre_poll_timeout(s))
+                        return LRE_RET_TIMEOUT;
                     res = lre_exec_backtrack(s, capture, stack, stack_len,
                                              pc1, cptr, true);
-                    if (res == -1)
+                    if (res == LRE_RET_MEMORY_ERROR ||
+                        res == LRE_RET_TIMEOUT)
                         return res;
                     if (!res)
                         break;
@@ -2458,7 +2481,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                                      RE_EXEC_STATE_GREEDY_QUANT,
                                      q - quant_min);
                     if (ret < 0)
-                        return -1;
+                        return LRE_RET_MEMORY_ERROR;
                 }
             }
             break;
@@ -2468,7 +2491,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
     }
 }
 
-/* Return 1 if match, 0 if not match or -1 if error. cindex is the
+/* Return 1 if match, 0 if not match or < 0 if error (see LRE_RET_x). cindex is the
    starting position of the match and must be such as 0 <= cindex <=
    clen. */
 int lre_exec(uint8_t **capture,
@@ -2492,6 +2515,7 @@ int lre_exec(uint8_t **capture,
     s->cbuf_type = cbuf_type;
     if (s->cbuf_type == 1 && s->is_unicode)
         s->cbuf_type = 2;
+    s->interrupt_counter = INTERRUPT_COUNTER_INIT;
     s->opaque = opaque;
 
     s->state_size = sizeof(REExecState) +
diff --git a/lib/monoucha0/monoucha/qjs/libregexp.h b/lib/monoucha0/monoucha/qjs/libregexp.h
index 0b8fec52..898e9a7a 100644
--- a/lib/monoucha0/monoucha/qjs/libregexp.h
+++ b/lib/monoucha0/monoucha/qjs/libregexp.h
@@ -43,6 +43,9 @@ extern "C" {
 #define LRE_FLAG_NAMED_GROUPS (1 << 7) /* named groups are present in the regexp */
 #define LRE_FLAG_UNICODE_SETS (1 << 8)
 
+#define LRE_RET_MEMORY_ERROR (-1)
+#define LRE_RET_TIMEOUT      (-2)
+
 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
                      const char *buf, size_t buf_len, int re_flags,
                      void *opaque);
@@ -60,6 +63,8 @@ void lre_byte_swap(uint8_t *buf, size_t len, bool is_byte_swapped);
 
 /* must be provided by the user */
 bool lre_check_stack_overflow(void *opaque, size_t alloca_size);
+/* must be provided by the user, return non zero if time out */
+int lre_check_timeout(void *opaque);
 void *lre_realloc(void *opaque, void *ptr, size_t size);
 
 /* JS identifier test */
diff --git a/lib/monoucha0/monoucha/qjs/quickjs-atom.h b/lib/monoucha0/monoucha/qjs/quickjs-atom.h
index 358fe230..67e17e7e 100644
--- a/lib/monoucha0/monoucha/qjs/quickjs-atom.h
+++ b/lib/monoucha0/monoucha/qjs/quickjs-atom.h
@@ -154,6 +154,7 @@ DEF(brand, "<brand>")
 DEF(hash_constructor, "#constructor")
 DEF(as, "as")
 DEF(from, "from")
+DEF(fromAsync, "fromAsync")
 DEF(meta, "meta")
 DEF(_default_, "*default*")
 DEF(_star_, "*")
diff --git a/lib/monoucha0/monoucha/qjs/quickjs-opcode.h b/lib/monoucha0/monoucha/qjs/quickjs-opcode.h
index 42c3faee..bd5be754 100644
--- a/lib/monoucha0/monoucha/qjs/quickjs-opcode.h
+++ b/lib/monoucha0/monoucha/qjs/quickjs-opcode.h
@@ -264,6 +264,7 @@ DEF(      strict_eq, 1, 2, 1, none)
 DEF(     strict_neq, 1, 2, 1, none)
 DEF(is_undefined_or_null, 1, 1, 1, none)
 DEF(     private_in, 1, 2, 1, none)
+DEF(push_bigint_i32, 5, 0, 1, i32)
 /* must be the last non short and non temporary opcode */
 DEF(            nop, 1, 0, 0, none)
 
diff --git a/lib/monoucha0/monoucha/qjs/quickjs.c b/lib/monoucha0/monoucha/qjs/quickjs.c
index 8909a934..80c492bb 100644
--- a/lib/monoucha0/monoucha/qjs/quickjs.c
+++ b/lib/monoucha0/monoucha/qjs/quickjs.c
@@ -1,8 +1,8 @@
 /*
  * QuickJS Javascript Engine
  *
- * Copyright (c) 2017-2024 Fabrice Bellard
- * Copyright (c) 2017-2024 Charlie Gordon
+ * Copyright (c) 2017-2025 Fabrice Bellard
+ * Copyright (c) 2017-2025 Charlie Gordon
  * Copyright (c) 2023-2025 Ben Noordhuis
  * Copyright (c) 2023-2025 Saúl Ibarra Corretgé
  *
@@ -47,7 +47,7 @@
 #include "list.h"
 #include "quickjs.h"
 #include "libregexp.h"
-#include "libbf.h"
+#include "xsum.h"
 
 #if defined(EMSCRIPTEN) || defined(_MSC_VER)
 #define DIRECT_DISPATCH  0
@@ -237,6 +237,11 @@ typedef struct JSRuntimeFinalizerState {
     void *arg;
 } JSRuntimeFinalizerState;
 
+typedef struct JSValueLink {
+    struct JSValueLink *next;
+    JSValueConst value;
+} JSValueLink;
+
 struct JSRuntime {
     JSMallocFunctions mf;
     JSMallocState malloc_state;
@@ -286,6 +291,12 @@ struct JSRuntime {
     JSInterruptHandler *interrupt_handler;
     void *interrupt_opaque;
 
+    JSPromiseHook *promise_hook;
+    void *promise_hook_opaque;
+    // for smuggling the parent promise from js_promise_then
+    // to js_promise_constructor
+    JSValueLink *parent_promise;
+
     JSHostPromiseRejectionTracker *host_promise_rejection_tracker;
     void *host_promise_rejection_tracker_opaque;
 
@@ -308,7 +319,6 @@ struct JSRuntime {
     int shape_hash_size;
     int shape_hash_count; /* number of hashed shapes */
     JSShape **shape_hash;
-    bf_context_t bf_ctx;
     void *user_opaque;
     void *libc_opaque;
     JSRuntimeFinalizerState *finalizers;
@@ -369,15 +379,7 @@ typedef struct JSVarRef {
         struct {
             int __gc_ref_count; /* corresponds to header.ref_count */
             uint8_t __gc_mark; /* corresponds to header.mark/gc_obj_type */
-
-            /* 0 : the JSVarRef is on the stack. header.link is an element
-               of JSStackFrame.var_ref_list.
-               1 : the JSVarRef is detached. header.link has the normal meanning
-            */
-            uint8_t is_detached : 1;
-            uint8_t is_arg : 1;
-            uint16_t var_idx; /* index of the corresponding function variable on
-                                 the stack */
+            bool is_detached;
         };
     };
     JSValue *pvalue; /* pointer to the value, either on the stack or
@@ -389,19 +391,48 @@ typedef struct JSRefCountHeader {
     int ref_count;
 } JSRefCountHeader;
 
-/* the same structure is used for big integers.
-   Big integers are never infinite or NaNs */
+/* bigint */
+typedef int32_t js_slimb_t;
+typedef uint32_t js_limb_t;
+typedef int64_t js_sdlimb_t;
+typedef uint64_t js_dlimb_t;
+
+#define JS_LIMB_DIGITS 9
+
+/* Must match the size of short_big_int in JSValueUnion */
+#define JS_LIMB_BITS 32
+#define JS_SHORT_BIG_INT_BITS JS_LIMB_BITS
+#define JS_BIGINT_MAX_SIZE ((1024 * 1024) / JS_LIMB_BITS) /* in limbs */
+#define JS_SHORT_BIG_INT_MIN INT32_MIN
+#define JS_SHORT_BIG_INT_MAX INT32_MAX
+
+
 typedef struct JSBigInt {
     JSRefCountHeader header; /* must come first, 32-bit */
-    bf_t num;
+    uint32_t len; /* number of limbs, >= 1 */
+    js_limb_t tab[]; /* two's complement representation, always
+                        normalized so that 'len' is the minimum
+                        possible length >= 1 */
 } JSBigInt;
 
+/* this bigint structure can hold a 64 bit integer */
+typedef struct {
+    js_limb_t big_int_buf[sizeof(JSBigInt) / sizeof(js_limb_t)]; /* for JSBigInt */
+    /* must come just after */
+    js_limb_t tab[(64 + JS_LIMB_BITS - 1) / JS_LIMB_BITS];
+} JSBigIntBuf;
+
 typedef enum {
     JS_AUTOINIT_ID_PROTOTYPE,
     JS_AUTOINIT_ID_MODULE_NS,
     JS_AUTOINIT_ID_PROP,
+    JS_AUTOINIT_ID_BYTECODE,
 } JSAutoInitIDEnum;
 
+enum {
+    JS_BUILTIN_ARRAY_FROMASYNC = 1,
+};
+
 /* must be large enough to have a negligible runtime cost and small
    enough to call the interrupt callback often. */
 #define JS_INTERRUPT_COUNTER_INIT 10000
@@ -440,7 +471,7 @@ struct JSContext {
     double time_origin;
 
     uint64_t random_state;
-    bf_context_t *bf_ctx;   /* points to rt->bf_ctx, shared by all contexts */
+
     /* when the counter reaches zero, JSRutime.interrupt_handler is called */
     int interrupt_counter;
 
@@ -478,6 +509,26 @@ typedef struct JSWeakRefRecord {
     } u;
 } JSWeakRefRecord;
 
+typedef struct JSMapRecord {
+    int ref_count; /* used during enumeration to avoid freeing the record */
+    bool empty; /* true if the record is deleted */
+    struct JSMapState *map;
+    struct list_head link;
+    struct list_head hash_link;
+    JSValue key;
+    JSValue value;
+} JSMapRecord;
+
+typedef struct JSMapState {
+    bool is_weak; /* true if WeakSet/WeakMap */
+    struct list_head records; /* list of JSMapRecord.link */
+    uint32_t record_count;
+    struct list_head *hash_table;
+    uint32_t hash_size; /* must be a power of two */
+    uint32_t record_count_threshold; /* count at which a hash table
+                                        resize is needed */
+} JSMapState;
+
 enum {
     JS_ATOM_TYPE_STRING = 1,
     JS_ATOM_TYPE_GLOBAL_SYMBOL,
@@ -1122,6 +1173,12 @@ static int JS_NewClass1(JSRuntime *rt, JSClassID class_id,
                         const JSClassDef *class_def, JSAtom name);
 static JSValue js_array_push(JSContext *ctx, JSValueConst this_val,
                              int argc, JSValueConst *argv, int unshift);
+static JSValue js_array_constructor(JSContext *ctx, JSValueConst new_target,
+                                    int argc, JSValueConst *argv);
+static JSValue js_error_constructor(JSContext *ctx, JSValueConst new_target,
+                                    int argc, JSValueConst *argv, int magic);
+static JSValue js_object_defineProperty(JSContext *ctx, JSValueConst this_val,
+                                        int argc, JSValueConst *argv, int magic);
 
 typedef enum JSStrictEqModeEnum {
     JS_EQ_STRICT,
@@ -1137,18 +1194,7 @@ static bool js_same_value_zero(JSContext *ctx, JSValueConst op1, JSValueConst op
 static JSValue JS_ToObjectFree(JSContext *ctx, JSValue val);
 static JSProperty *add_property(JSContext *ctx,
                                 JSObject *p, JSAtom prop, int prop_flags);
-static JSValue JS_NewBigInt(JSContext *ctx);
-static inline bf_t *JS_GetBigInt(JSValueConst val)
-{
-    JSBigInt *p = JS_VALUE_GET_PTR(val);
-    return &p->num;
-}
-static JSValue JS_CompactBigInt1(JSContext *ctx, JSValue val);
-static JSValue JS_CompactBigInt(JSContext *ctx, JSValue val);
 static int JS_ToBigInt64Free(JSContext *ctx, int64_t *pres, JSValue val);
-static bf_t *JS_ToBigInt(JSContext *ctx, bf_t *buf, JSValueConst val);
-static bf_t *JS_ToBigInt1(JSContext *ctx, bf_t *buf, JSValueConst val);
-static void JS_FreeBigInt(JSContext *ctx, bf_t *a, bf_t *buf);
 JSValue JS_ThrowOutOfMemory(JSContext *ctx);
 static JSValue JS_ThrowTypeErrorRevokedProxy(JSContext *ctx);
 static JSValue js_proxy_getPrototypeOf(JSContext *ctx, JSValueConst obj);
@@ -1505,13 +1551,6 @@ void *js_mallocz_rt(JSRuntime *rt, size_t size)
     return js_calloc_rt(rt, 1, size);
 }
 
-/* called by libbf */
-static void *js_bf_realloc(void *opaque, void *ptr, size_t size)
-{
-    JSRuntime *rt = opaque;
-    return js_realloc_rt(rt, ptr, size);
-}
-
 /* Throw out of memory in case of error */
 void *js_calloc(JSContext *ctx, size_t count, size_t size)
 {
@@ -1691,8 +1730,8 @@ static JSClassShortDef const js_std_class_def[] = {
     { JS_ATOM_BigInt, js_object_data_finalizer, js_object_data_mark },      /* JS_CLASS_BIG_INT */
     { JS_ATOM_Map, js_map_finalizer, js_map_mark },             /* JS_CLASS_MAP */
     { JS_ATOM_Set, js_map_finalizer, js_map_mark },             /* JS_CLASS_SET */
-    { JS_ATOM_WeakMap, js_map_finalizer, js_map_mark },         /* JS_CLASS_WEAKMAP */
-    { JS_ATOM_WeakSet, js_map_finalizer, js_map_mark },         /* JS_CLASS_WEAKSET */
+    { JS_ATOM_WeakMap, js_map_finalizer, NULL },         /* JS_CLASS_WEAKMAP */
+    { JS_ATOM_WeakSet, js_map_finalizer, NULL },         /* JS_CLASS_WEAKSET */
     { JS_ATOM_Iterator, NULL, NULL },                           /* JS_CLASS_ITERATOR */
     { JS_ATOM_IteratorHelper, js_iterator_helper_finalizer, js_iterator_helper_mark }, /* JS_CLASS_ITERATOR_HELPER */
     { JS_ATOM_IteratorWrap, js_iterator_wrap_finalizer, js_iterator_wrap_mark }, /* JS_CLASS_ITERATOR_WRAP */
@@ -1770,8 +1809,6 @@ JSRuntime *JS_NewRuntime2(const JSMallocFunctions *mf, void *opaque)
     rt->malloc_state = ms;
     rt->malloc_gc_threshold = 256 * 1024;
 
-    bf_context_init(&rt->bf_ctx, js_bf_realloc, rt);
-
     init_list_head(&rt->context_list);
     init_list_head(&rt->gc_obj_list);
     init_list_head(&rt->gc_zero_ref_count_list);
@@ -2127,8 +2164,6 @@ void JS_FreeRuntime(JSRuntime *rt)
     }
     js_free_rt(rt, rt->class_array);
 
-    bf_context_end(&rt->bf_ctx);
-
 #ifdef ENABLE_DUMPS // JS_DUMP_ATOM_LEAKS
     /* only the atoms defined in JS_InitAtoms() should be left */
     if (check_dump_flag(rt, JS_DUMP_ATOM_LEAKS)) {
@@ -2275,7 +2310,6 @@ JSContext *JS_NewContextRaw(JSRuntime *rt)
     }
     ctx->rt = rt;
     list_add_tail(&ctx->link, &rt->context_list);
-    ctx->bf_ctx = &rt->bf_ctx;
     for(i = 0; i < rt->class_count; i++)
         ctx->class_proto[i] = JS_NULL;
     ctx->array_ctor = JS_NULL;
@@ -3445,6 +3479,8 @@ const char *JS_AtomToCString(JSContext *ctx, JSAtom atom)
     return cstr;
 }
 
+#ifndef QJS_DISABLE_PARSER
+
 /* return a string atom containing name concatenated with str1 */
 /* `str1` may be pure ASCII or UTF-8 encoded */
 // TODO(chqrlie): use string concatenation instead of UTF-8 conversion
@@ -3487,6 +3523,8 @@ static JSAtom js_atom_concat_num(JSContext *ctx, JSAtom name, uint32_t n)
     return js_atom_concat_str(ctx, name, buf);
 }
 
+#endif // QJS_DISABLE_PARSER
+
 static inline bool JS_IsEmptyString(JSValueConst v)
 {
     return JS_VALUE_GET_TAG(v) == JS_TAG_STRING && JS_VALUE_GET_STRING(v)->len == 0;
@@ -4041,6 +4079,19 @@ JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
     return JS_MKPTR(JS_TAG_STRING, str);
 }
 
+JSValue JS_NewTwoByteString(JSContext *ctx, const uint16_t *buf, size_t len)
+{
+    JSString *str;
+
+    if (!len)
+        return JS_AtomToString(ctx, JS_ATOM_empty_string);
+    str = js_alloc_string(ctx, len, 1);
+    if (!str)
+        return JS_EXCEPTION;
+    memcpy(str16(str), buf, len * sizeof(*buf));
+    return JS_MKPTR(JS_TAG_STRING, str);
+}
+
 static JSValue JS_ConcatString3(JSContext *ctx, const char *str1,
                                 JSValue str2, const char *str3)
 {
@@ -5778,9 +5829,8 @@ static void js_free_value_rt(JSRuntime *rt, JSValue v)
         break;
     case JS_TAG_BIG_INT:
         {
-            JSBigInt *bf = JS_VALUE_GET_PTR(v);
-            bf_delete(&bf->num);
-            js_free_rt(rt, bf);
+            JSBigInt *p = JS_VALUE_GET_PTR(v);
+            js_free_rt(rt, p);
         }
         break;
     case JS_TAG_SYMBOL:
@@ -5839,6 +5889,22 @@ void JS_MarkValue(JSRuntime *rt, JSValueConst val, JS_MarkFunc *mark_func)
     }
 }
 
+static void mark_weak_map_value(JSRuntime *rt, JSWeakRefRecord *first_weak_ref, JS_MarkFunc *mark_func) {
+    JSWeakRefRecord *wr;
+    JSMapRecord *mr;
+    JSMapState *s;
+
+    for (wr = first_weak_ref; wr != NULL; wr = wr->next_weak_ref) {
+        if (wr->kind == JS_WEAK_REF_KIND_MAP) {
+            mr = wr->u.map_record;
+            s = mr->map;
+            assert(s->is_weak);
+            assert(!mr->empty); /* no iterator on WeakMap/WeakSet */
+            JS_MarkValue(rt, mr->value, mark_func);
+        }
+    }
+}
+
 static void mark_children(JSRuntime *rt, JSGCObjectHeader *gp,
                           JS_MarkFunc *mark_func)
 {
@@ -5878,6 +5944,10 @@ static void mark_children(JSRuntime *rt, JSGCObjectHeader *gp,
                 prs++;
             }
 
+            if (unlikely(p->first_weak_ref)) {
+                mark_weak_map_value(rt, p->first_weak_ref, mark_func);
+            }
+
             if (p->class_id != JS_CLASS_OBJECT) {
                 JSClassGCMark *gc_mark;
                 gc_mark = rt->class_array[p->class_id].gc_mark;
@@ -7219,15 +7289,19 @@ static JSValue JS_ThrowTypeErrorInvalidClass(JSContext *ctx, int class_id)
     return JS_ThrowTypeErrorAtom(ctx, "%s object expected", name);
 }
 
+static void JS_ThrowInterrupted(JSContext *ctx)
+{
+    JS_ThrowInternalError(ctx, "interrupted");
+    JS_SetUncatchableError(ctx, ctx->rt->current_exception);
+}
+
 static no_inline __exception int __js_poll_interrupts(JSContext *ctx)
 {
     JSRuntime *rt = ctx->rt;
     ctx->interrupt_counter = JS_INTERRUPT_COUNTER_INIT;
     if (rt->interrupt_handler) {
         if (rt->interrupt_handler(rt, rt->interrupt_opaque)) {
-            /* XXX: should set a specific flag to avoid catching */
-            JS_ThrowInternalError(ctx, "interrupted");
-            js_set_uncatchable_error(ctx, ctx->rt->current_exception, true);
+            JS_ThrowInterrupted(ctx);
             return -1;
         }
     }
@@ -7331,6 +7405,7 @@ static JSValueConst JS_GetPrototypePrimitive(JSContext *ctx, JSValueConst val)
 {
     JSValue ret;
     switch(JS_VALUE_GET_NORM_TAG(val)) {
+    case JS_TAG_SHORT_BIG_INT:
     case JS_TAG_BIG_INT:
         ret = ctx->class_proto[JS_CLASS_BIG_INT];
         break;
@@ -7494,13 +7569,55 @@ int JS_IsInstanceOf(JSContext *ctx, JSValueConst val, JSValueConst obj)
     return JS_OrdinaryIsInstanceOf(ctx, val, obj);
 }
 
+#include "builtin-array-fromasync.h"
+
+static JSValue js_bytecode_autoinit(JSContext *ctx, JSObject *p, JSAtom atom,
+                                    void *opaque)
+{
+    switch ((uintptr_t)opaque) {
+    default:
+        abort();
+    case JS_BUILTIN_ARRAY_FROMASYNC:
+        {
+            JSValue obj = JS_ReadObject(ctx, qjsc_builtin_array_fromasync,
+                                        sizeof(qjsc_builtin_array_fromasync),
+                                        JS_READ_OBJ_BYTECODE);
+            if (JS_IsException(obj))
+                return JS_EXCEPTION;
+            JSValue fun = JS_EvalFunction(ctx, obj);
+            if (JS_IsException(fun))
+                return JS_EXCEPTION;
+            assert(JS_IsFunction(ctx, fun));
+            JSValue args[] = {
+                JS_NewCFunction(ctx, js_array_constructor, "Array", 0),
+                JS_NewCFunctionMagic(ctx, js_error_constructor, "TypeError", 1,
+                                     JS_CFUNC_constructor_or_func_magic,
+                                     JS_TYPE_ERROR),
+                JS_AtomToValue(ctx, JS_ATOM_Symbol_asyncIterator),
+                JS_NewCFunctionMagic(ctx, js_object_defineProperty,
+                                     "Object.defineProperty", 3,
+                                     JS_CFUNC_generic_magic, 0),
+                JS_AtomToValue(ctx, JS_ATOM_Symbol_iterator),
+            };
+            JSValue result = JS_Call(ctx, fun, JS_UNDEFINED,
+                                     countof(args), vc(args));
+            for (size_t i = 0; i < countof(args); i++)
+                JS_FreeValue(ctx, args[i]);
+            JS_FreeValue(ctx, fun);
+            return result;
+        }
+    }
+    return JS_UNDEFINED;
+}
+
 /* return the value associated to the autoinit property or an exception */
 typedef JSValue JSAutoInitFunc(JSContext *ctx, JSObject *p, JSAtom atom, void *opaque);
 
-static JSAutoInitFunc *js_autoinit_func_table[] = {
+static JSAutoInitFunc *const js_autoinit_func_table[] = {
     js_instantiate_prototype, /* JS_AUTOINIT_ID_PROTOTYPE */
     js_module_ns_autoinit, /* JS_AUTOINIT_ID_MODULE_NS */
     JS_InstantiateFunctionListItem2, /* JS_AUTOINIT_ID_PROP */
+    js_bytecode_autoinit, /* JS_AUTOINIT_ID_BYTECODE */
 };
 
 /* warning: 'prs' is reallocated after it */
@@ -7534,9 +7651,8 @@ static JSValue JS_GetPropertyInternal(JSContext *ctx, JSValueConst obj,
     JSObject *p;
     JSProperty *pr;
     JSShapeProperty *prs;
-    uint32_t tag, proto_depth;
+    uint32_t tag;
 
-    proto_depth = 0;
     tag = JS_VALUE_GET_TAG(obj);
     if (unlikely(tag != JS_TAG_OBJECT)) {
         switch(tag) {
@@ -7658,7 +7774,6 @@ static JSValue JS_GetPropertyInternal(JSContext *ctx, JSValueConst obj,
                 }
             }
         }
-        proto_depth++;
         p = p->shape->proto;
         if (!p)
             break;
@@ -8988,6 +9103,8 @@ retry:
                 goto retry2;
             } else if (!(prs->flags & JS_PROP_WRITABLE)) {
                 goto read_only_prop;
+            } else {
+                break;
             }
         }
     }
@@ -10442,11 +10559,24 @@ static int JS_ToBoolFree(JSContext *ctx, JSValue val)
             JS_FreeValue(ctx, val);
             return ret;
         }
+    case JS_TAG_SHORT_BIG_INT:
+        return JS_VALUE_GET_SHORT_BIG_INT(val) != 0;
     case JS_TAG_BIG_INT:
         {
             JSBigInt *p = JS_VALUE_GET_PTR(val);
             bool ret;
-            ret = p->num.expn != BF_EXP_ZERO && p->num.expn != BF_EXP_NAN;
+            int i;
+
+            /* fail safe: we assume it is not necessarily
+                normalized. Beginning from the MSB ensures that the
+                test is fast. */
+            ret = false;
+            for(i = p->len - 1; i >= 0; i--) {
+                if (p->tab[i] != 0) {
+                    ret = true;
+                    break;
+                }
+            }
             JS_FreeValue(ctx, val);
             return ret;
         }
@@ -10509,6 +10639,1392 @@ static inline int to_digit(int c)
         return 36;
 }
 
+/* bigint support */
+
+#define ADDC(res, carry_out, op1, op2, carry_in)        \
+do {                                                    \
+    js_limb_t __v, __a, __k, __k1;                      \
+    __v = (op1);                                        \
+    __a = __v + (op2);                                  \
+    __k1 = __a < __v;                                   \
+    __k = (carry_in);                                   \
+    __a = __a + __k;                                    \
+    carry_out = (__a < __k) | __k1;                     \
+    res = __a;                                          \
+} while (0)
+
+/* a != 0 */
+static inline js_limb_t js_limb_clz(js_limb_t a)
+{
+    return clz32(a);
+}
+
+static js_limb_t mp_add(js_limb_t *res, const js_limb_t *op1, const js_limb_t *op2,
+                     js_limb_t n, js_limb_t carry)
+{
+    int i;
+    for(i = 0;i < n; i++) {
+        ADDC(res[i], carry, op1[i], op2[i], carry);
+    }
+    return carry;
+}
+
+static js_limb_t mp_sub(js_limb_t *res, const js_limb_t *op1, const js_limb_t *op2,
+                        int n, js_limb_t carry)
+{
+    int i;
+    js_limb_t k, a, v, k1;
+
+    k = carry;
+    for(i=0;i<n;i++) {
+        v = op1[i];
+        a = v - op2[i];
+        k1 = a > v;
+        v = a - k;
+        k = (v > a) | k1;
+        res[i] = v;
+    }
+    return k;
+}
+
+/* compute 0 - op2. carry = 0 or 1. */
+static js_limb_t mp_neg(js_limb_t *res, const js_limb_t *op2, int n)
+{
+    int i;
+    js_limb_t v, carry;
+
+    carry = 1;
+    for(i=0;i<n;i++) {
+        v = ~op2[i] + carry;
+        carry = v < carry;
+        res[i] = v;
+    }
+    return carry;
+}
+
+/* tabr[] = taba[] * b + l. Return the high carry */
+static js_limb_t mp_mul1(js_limb_t *tabr, const js_limb_t *taba, js_limb_t n,
+                      js_limb_t b, js_limb_t l)
+{
+    js_limb_t i;
+    js_dlimb_t t;
+
+    for(i = 0; i < n; i++) {
+        t = (js_dlimb_t)taba[i] * (js_dlimb_t)b + l;
+        tabr[i] = t;
+        l = t >> JS_LIMB_BITS;
+    }
+    return l;
+}
+
+static js_limb_t mp_div1(js_limb_t *tabr, const js_limb_t *taba, js_limb_t n,
+                      js_limb_t b, js_limb_t r)
+{
+    js_slimb_t i;
+    js_dlimb_t a1;
+    for(i = n - 1; i >= 0; i--) {
+        a1 = ((js_dlimb_t)r << JS_LIMB_BITS) | taba[i];
+        tabr[i] = a1 / b;
+        r = a1 % b;
+    }
+    return r;
+}
+
+/* tabr[] += taba[] * b, return the high word. */
+static js_limb_t mp_add_mul1(js_limb_t *tabr, const js_limb_t *taba, js_limb_t n,
+                          js_limb_t b)
+{
+    js_limb_t i, l;
+    js_dlimb_t t;
+
+    l = 0;
+    for(i = 0; i < n; i++) {
+        t = (js_dlimb_t)taba[i] * (js_dlimb_t)b + l + tabr[i];
+        tabr[i] = t;
+        l = t >> JS_LIMB_BITS;
+    }
+    return l;
+}
+
+/* size of the result : op1_size + op2_size. */
+static void mp_mul_basecase(js_limb_t *result,
+                            const js_limb_t *op1, js_limb_t op1_size,
+                            const js_limb_t *op2, js_limb_t op2_size)
+{
+    int i;
+    js_limb_t r;
+
+    result[op1_size] = mp_mul1(result, op1, op1_size, op2[0], 0);
+    for(i=1;i<op2_size;i++) {
+        r = mp_add_mul1(result + i, op1, op1_size, op2[i]);
+        result[i + op1_size] = r;
+    }
+}
+
+/* tabr[] -= taba[] * b. Return the value to subtract from the high
+   word. */
+static js_limb_t mp_sub_mul1(js_limb_t *tabr, const js_limb_t *taba, js_limb_t n,
+                          js_limb_t b)
+{
+    js_limb_t i, l;
+    js_dlimb_t t;
+
+    l = 0;
+    for(i = 0; i < n; i++) {
+        t = tabr[i] - (js_dlimb_t)taba[i] * (js_dlimb_t)b - l;
+        tabr[i] = t;
+        l = -(t >> JS_LIMB_BITS);
+    }
+    return l;
+}
+
+/* WARNING: d must be >= 2^(JS_LIMB_BITS-1) */
+static inline js_limb_t udiv1norm_init(js_limb_t d)
+{
+    js_limb_t a0, a1;
+    a1 = -d - 1;
+    a0 = -1;
+    return (((js_dlimb_t)a1 << JS_LIMB_BITS) | a0) / d;
+}
+
+/* return the quotient and the remainder in '*pr' of 'a1*2^JS_LIMB_BITS+a0
+   / d' with 0 <= a1 < d. */
+static inline js_limb_t udiv1norm(js_limb_t *pr, js_limb_t a1, js_limb_t a0,
+                                js_limb_t d, js_limb_t d_inv)
+{
+    js_limb_t n1m, n_adj, q, r, ah;
+    js_dlimb_t a;
+    n1m = ((js_slimb_t)a0 >> (JS_LIMB_BITS - 1));
+    n_adj = a0 + (n1m & d);
+    a = (js_dlimb_t)d_inv * (a1 - n1m) + n_adj;
+    q = (a >> JS_LIMB_BITS) + a1;
+    /* compute a - q * r and update q so that the remainder is
+       between 0 and d - 1 */
+    a = ((js_dlimb_t)a1 << JS_LIMB_BITS) | a0;
+    a = a - (js_dlimb_t)q * d - d;
+    ah = a >> JS_LIMB_BITS;
+    q += 1 + ah;
+    r = (js_limb_t)a + (ah & d);
+    *pr = r;
+    return q;
+}
+
+#define UDIV1NORM_THRESHOLD 3
+
+/* b must be >= 1 << (JS_LIMB_BITS - 1) */
+static js_limb_t mp_div1norm(js_limb_t *tabr, const js_limb_t *taba, js_limb_t n,
+                          js_limb_t b, js_limb_t r)
+{
+    js_slimb_t i;
+
+    if (n >= UDIV1NORM_THRESHOLD) {
+        js_limb_t b_inv;
+        b_inv = udiv1norm_init(b);
+        for(i = n - 1; i >= 0; i--) {
+            tabr[i] = udiv1norm(&r, r, taba[i], b, b_inv);
+        }
+    } else {
+        js_dlimb_t a1;
+        for(i = n - 1; i >= 0; i--) {
+            a1 = ((js_dlimb_t)r << JS_LIMB_BITS) | taba[i];
+            tabr[i] = a1 / b;
+            r = a1 % b;
+        }
+    }
+    return r;
+}
+
+/* base case division: divides taba[0..na-1] by tabb[0..nb-1]. tabb[nb
+   - 1] must be >= 1 << (JS_LIMB_BITS - 1). na - nb must be >= 0. 'taba'
+   is modified and contains the remainder (nb limbs). tabq[0..na-nb]
+   contains the quotient with tabq[na - nb] <= 1. */
+static void mp_divnorm(js_limb_t *tabq, js_limb_t *taba, js_limb_t na,
+                       const js_limb_t *tabb, js_limb_t nb)
+{
+    js_limb_t r, a, c, q, v, b1, b1_inv, n, dummy_r;
+    int i, j;
+
+    b1 = tabb[nb - 1];
+    if (nb == 1) {
+        taba[0] = mp_div1norm(tabq, taba, na, b1, 0);
+        return;
+    }
+    n = na - nb;
+
+    if (n >= UDIV1NORM_THRESHOLD)
+        b1_inv = udiv1norm_init(b1);
+    else
+        b1_inv = 0;
+
+    /* first iteration: the quotient is only 0 or 1 */
+    q = 1;
+    for(j = nb - 1; j >= 0; j--) {
+        if (taba[n + j] != tabb[j]) {
+            if (taba[n + j] < tabb[j])
+                q = 0;
+            break;
+        }
+    }
+    tabq[n] = q;
+    if (q) {
+        mp_sub(taba + n, taba + n, tabb, nb, 0);
+    }
+
+    for(i = n - 1; i >= 0; i--) {
+        if (unlikely(taba[i + nb] >= b1)) {
+            q = -1;
+        } else if (b1_inv) {
+            q = udiv1norm(&dummy_r, taba[i + nb], taba[i + nb - 1], b1, b1_inv);
+        } else {
+            js_dlimb_t al;
+            al = ((js_dlimb_t)taba[i + nb] << JS_LIMB_BITS) | taba[i + nb - 1];
+            q = al / b1;
+            r = al % b1;
+        }
+        r = mp_sub_mul1(taba + i, tabb, nb, q);
+
+        v = taba[i + nb];
+        a = v - r;
+        c = (a > v);
+        taba[i + nb] = a;
+
+        if (c != 0) {
+            /* negative result */
+            for(;;) {
+                q--;
+                c = mp_add(taba + i, taba + i, tabb, nb, 0);
+                /* propagate carry and test if positive result */
+                if (c != 0) {
+                    if (++taba[i + nb] == 0) {
+                        break;
+                    }
+                }
+            }
+        }
+        tabq[i] = q;
+    }
+}
+
+/* 1 <= shift <= JS_LIMB_BITS - 1 */
+static js_limb_t mp_shl(js_limb_t *tabr, const js_limb_t *taba, int n,
+                        int shift)
+{
+    int i;
+    js_limb_t l, v;
+    l = 0;
+    for(i = 0; i < n; i++) {
+        v = taba[i];
+        tabr[i] = (v << shift) | l;
+        l = v >> (JS_LIMB_BITS - shift);
+    }
+    return l;
+}
+
+/* r = (a + high*B^n) >> shift. Return the remainder r (0 <= r < 2^shift).
+   1 <= shift <= LIMB_BITS - 1 */
+static js_limb_t mp_shr(js_limb_t *tab_r, const js_limb_t *tab, int n,
+                        int shift, js_limb_t high)
+{
+    int i;
+    js_limb_t l, a;
+
+    l = high;
+    for(i = n - 1; i >= 0; i--) {
+        a = tab[i];
+        tab_r[i] = (a >> shift) | (l << (JS_LIMB_BITS - shift));
+        l = a;
+    }
+    return l & (((js_limb_t)1 << shift) - 1);
+}
+
+static JSBigInt *js_bigint_new(JSContext *ctx, int len)
+{
+    JSBigInt *r;
+    if (len > JS_BIGINT_MAX_SIZE) {
+        JS_ThrowRangeError(ctx, "BigInt is too large to allocate");
+        return NULL;
+    }
+    r = js_malloc(ctx, sizeof(JSBigInt) + len * sizeof(js_limb_t));
+    if (!r)
+        return NULL;
+    r->header.ref_count = 1;
+    r->len = len;
+    return r;
+}
+
+static JSBigInt *js_bigint_set_si(JSBigIntBuf *buf, js_slimb_t a)
+{
+    JSBigInt *r = (JSBigInt *)buf->big_int_buf;
+    r->header.ref_count = 0; /* fail safe */
+    r->len = 1;
+    r->tab[0] = a;
+    return r;
+}
+
+static JSBigInt *js_bigint_set_si64(JSBigIntBuf *buf, int64_t a)
+{
+    JSBigInt *r = (JSBigInt *)buf->big_int_buf;
+    r->header.ref_count = 0; /* fail safe */
+    if (a >= INT32_MIN && a <= INT32_MAX) {
+        r->len = 1;
+        r->tab[0] = a;
+    } else {
+        r->len = 2;
+        r->tab[0] = a;
+        r->tab[1] = a >> JS_LIMB_BITS;
+    }
+    return r;
+}
+
+/* val must be a short big int */
+static JSBigInt *js_bigint_set_short(JSBigIntBuf *buf, JSValueConst val)
+{
+    return js_bigint_set_si(buf, JS_VALUE_GET_SHORT_BIG_INT(val));
+}
+
+static __maybe_unused void js_bigint_dump1(JSContext *ctx, const char *str,
+                                           const js_limb_t *tab, int len)
+{
+    int i;
+    printf("%s: ", str);
+    for(i = len - 1; i >= 0; i--) {
+        printf(" %08x", tab[i]);
+    }
+    printf("\n");
+}
+
+static __maybe_unused void js_bigint_dump(JSContext *ctx, const char *str,
+                                          const JSBigInt *p)
+{
+    js_bigint_dump1(ctx, str, p->tab, p->len);
+}
+
+static JSBigInt *js_bigint_new_si(JSContext *ctx, js_slimb_t a)
+{
+    JSBigInt *r;
+    r = js_bigint_new(ctx, 1);
+    if (!r)
+        return NULL;
+    r->tab[0] = a;
+    return r;
+}
+
+static JSBigInt *js_bigint_new_si64(JSContext *ctx, int64_t a)
+{
+    if (a >= INT32_MIN && a <= INT32_MAX) {
+        return js_bigint_new_si(ctx, a);
+    } else {
+        JSBigInt *r;
+        r = js_bigint_new(ctx, 2);
+        if (!r)
+            return NULL;
+        r->tab[0] = a;
+        r->tab[1] = a >> 32;
+        return r;
+    }
+}
+
+static JSBigInt *js_bigint_new_ui64(JSContext *ctx, uint64_t a)
+{
+    if (a <= INT64_MAX) {
+        return js_bigint_new_si64(ctx, a);
+    } else {
+        JSBigInt *r;
+        r = js_bigint_new(ctx, (65 + JS_LIMB_BITS - 1) / JS_LIMB_BITS);
+        if (!r)
+            return NULL;
+        r->tab[0] = a;
+        r->tab[1] = a >> 32;
+        r->tab[2] = 0;
+        return r;
+    }
+}
+
+static JSBigInt *js_bigint_new_di(JSContext *ctx, js_sdlimb_t a)
+{
+    JSBigInt *r;
+    if (a == (js_slimb_t)a) {
+        r = js_bigint_new(ctx, 1);
+        if (!r)
+            return NULL;
+        r->tab[0] = a;
+    } else {
+        r = js_bigint_new(ctx, 2);
+        if (!r)
+            return NULL;
+        r->tab[0] = a;
+        r->tab[1] = a >> JS_LIMB_BITS;
+    }
+    return r;
+}
+
+/* Remove redundant high order limbs. Warning: 'a' may be
+   reallocated. Can never fail.
+*/
+static JSBigInt *js_bigint_normalize1(JSContext *ctx, JSBigInt *a, int l)
+{
+    js_limb_t v;
+
+    assert(a->header.ref_count == 1);
+    while (l > 1) {
+        v = a->tab[l - 1];
+        if ((v != 0 && v != -1) ||
+            (v & 1) != (a->tab[l - 2] >> (JS_LIMB_BITS - 1))) {
+            break;
+        }
+        l--;
+    }
+    if (l != a->len) {
+        JSBigInt *a1;
+        /* realloc to reduce the size */
+        a->len = l;
+        a1 = js_realloc(ctx, a, sizeof(JSBigInt) + l * sizeof(js_limb_t));
+        if (a1)
+            a = a1;
+    }
+    return a;
+}
+
+static JSBigInt *js_bigint_normalize(JSContext *ctx, JSBigInt *a)
+{
+    return js_bigint_normalize1(ctx, a, a->len);
+}
+
+/* return 0 or 1 depending on the sign */
+static inline int js_bigint_sign(const JSBigInt *a)
+{
+    return a->tab[a->len - 1] >> (JS_LIMB_BITS - 1);
+}
+
+static js_slimb_t js_bigint_get_si_sat(const JSBigInt *a)
+{
+    if (a->len == 1) {
+        return a->tab[0];
+    } else {
+        if (js_bigint_sign(a))
+            return INT32_MIN;
+        else
+            return INT32_MAX;
+    }
+}
+
+/* add the op1 limb */
+static JSBigInt *js_bigint_extend(JSContext *ctx, JSBigInt *r,
+                                  js_limb_t op1)
+{
+    int n2 = r->len;
+    if ((op1 != 0 && op1 != -1) ||
+        (op1 & 1) != r->tab[n2 - 1] >> (JS_LIMB_BITS - 1)) {
+        JSBigInt *r1;
+        r1 = js_realloc(ctx, r,
+                        sizeof(JSBigInt) + (n2 + 1) * sizeof(js_limb_t));
+        if (!r1) {
+            js_free(ctx, r);
+            return NULL;
+        }
+        r = r1;
+        r->len = n2 + 1;
+        r->tab[n2] = op1;
+    } else {
+        /* otherwise still need to normalize the result */
+        r = js_bigint_normalize(ctx, r);
+    }
+    return r;
+}
+
+/* return NULL in case of error. Compute a + b (b_neg = 0) or a - b
+   (b_neg = 1) */
+/* XXX: optimize */
+static JSBigInt *js_bigint_add(JSContext *ctx, const JSBigInt *a,
+                               const JSBigInt *b, int b_neg)
+{
+    JSBigInt *r;
+    int n1, n2, i;
+    js_limb_t carry, op1, op2, a_sign, b_sign;
+
+    n2 = max_int(a->len, b->len);
+    n1 = min_int(a->len, b->len);
+    r = js_bigint_new(ctx, n2);
+    if (!r)
+        return NULL;
+    /* XXX: optimize */
+    /* common part */
+    carry = b_neg;
+    for(i = 0; i < n1; i++) {
+        op1 = a->tab[i];
+        op2 = b->tab[i] ^ (-b_neg);
+        ADDC(r->tab[i], carry, op1, op2, carry);
+    }
+    a_sign = -js_bigint_sign(a);
+    b_sign = (-js_bigint_sign(b)) ^ (-b_neg);
+    /* part with sign extension of one operand  */
+    if (a->len > b->len) {
+        for(i = n1; i < n2; i++) {
+            op1 = a->tab[i];
+            ADDC(r->tab[i], carry, op1, b_sign, carry);
+        }
+    } else if (a->len < b->len) {
+        for(i = n1; i < n2; i++) {
+            op2 = b->tab[i] ^ (-b_neg);
+            ADDC(r->tab[i], carry, a_sign, op2, carry);
+        }
+    }
+
+    /* part with sign extension for both operands. Extend the result
+       if necessary */
+    return js_bigint_extend(ctx, r, a_sign + b_sign + carry);
+}
+
+/* XXX: optimize */
+static JSBigInt *js_bigint_neg(JSContext *ctx, const JSBigInt *a)
+{
+    JSBigIntBuf buf;
+    JSBigInt *b;
+    b = js_bigint_set_si(&buf, 0);
+    return js_bigint_add(ctx, b, a, 1);
+}
+
+static JSBigInt *js_bigint_mul(JSContext *ctx, const JSBigInt *a,
+                               const JSBigInt *b)
+{
+    JSBigInt *r;
+
+    r = js_bigint_new(ctx, a->len + b->len);
+    if (!r)
+        return NULL;
+    mp_mul_basecase(r->tab, a->tab, a->len, b->tab, b->len);
+    /* correct the result if negative operands (no overflow is
+       possible) */
+    if (js_bigint_sign(a))
+        mp_sub(r->tab + a->len, r->tab + a->len, b->tab, b->len, 0);
+    if (js_bigint_sign(b))
+        mp_sub(r->tab + b->len, r->tab + b->len, a->tab, a->len, 0);
+    return js_bigint_normalize(ctx, r);
+}
+
+/* return the division or the remainder. 'b' must be != 0. return NULL
+   in case of exception (division by zero or memory error) */
+static JSBigInt *js_bigint_divrem(JSContext *ctx, const JSBigInt *a,
+                                  const JSBigInt *b, bool is_rem)
+{
+    JSBigInt *r, *q;
+    js_limb_t *tabb, h;
+    int na, nb, a_sign, b_sign, shift;
+
+    if (b->len == 1 && b->tab[0] == 0) {
+        JS_ThrowRangeError(ctx, "BigInt division by zero");
+        return NULL;
+    }
+
+    a_sign = js_bigint_sign(a);
+    b_sign = js_bigint_sign(b);
+    na = a->len;
+    nb = b->len;
+
+    r = js_bigint_new(ctx, na + 2);
+    if (!r)
+        return NULL;
+    if (a_sign) {
+        mp_neg(r->tab, a->tab, na);
+    } else {
+        memcpy(r->tab, a->tab, na * sizeof(a->tab[0]));
+    }
+    /* normalize */
+    while (na > 1 && r->tab[na - 1] == 0)
+        na--;
+
+    tabb = js_malloc(ctx, nb * sizeof(tabb[0]));
+    if (!tabb) {
+        js_free(ctx, r);
+        return NULL;
+    }
+    if (b_sign) {
+        mp_neg(tabb, b->tab, nb);
+    } else {
+        memcpy(tabb, b->tab, nb * sizeof(tabb[0]));
+    }
+    /* normalize */
+    while (nb > 1 && tabb[nb - 1] == 0)
+        nb--;
+
+    /* trivial case if 'a' is small */
+    if (na < nb) {
+        js_free(ctx, r);
+        js_free(ctx, tabb);
+        if (is_rem) {
+            /* r = a */
+            r = js_bigint_new(ctx, a->len);
+            if (!r)
+                return NULL;
+            memcpy(r->tab, a->tab, a->len * sizeof(a->tab[0]));
+            return r;
+        } else {
+            /* q = 0 */
+            return js_bigint_new_si(ctx, 0);
+        }
+    }
+
+    /* normalize 'b' */
+    shift = js_limb_clz(tabb[nb - 1]);
+    if (shift != 0) {
+        mp_shl(tabb, tabb, nb, shift);
+        h = mp_shl(r->tab, r->tab, na, shift);
+        if (h != 0)
+            r->tab[na++] = h;
+    }
+
+    q = js_bigint_new(ctx, na - nb + 2); /* one more limb for the sign */
+    if (!q) {
+        js_free(ctx, r);
+        js_free(ctx, tabb);
+        return NULL;
+    }
+
+    //    js_bigint_dump1(ctx, "a", r->tab, na);
+    //    js_bigint_dump1(ctx, "b", tabb, nb);
+    mp_divnorm(q->tab, r->tab, na, tabb, nb);
+    js_free(ctx, tabb);
+
+    if (is_rem) {
+        js_free(ctx, q);
+        if (shift != 0)
+            mp_shr(r->tab, r->tab, nb, shift, 0);
+        r->tab[nb++] = 0;
+        if (a_sign)
+            mp_neg(r->tab, r->tab, nb);
+        r = js_bigint_normalize1(ctx, r, nb);
+        return r;
+    } else {
+        js_free(ctx, r);
+        q->tab[na - nb + 1] = 0;
+        if (a_sign ^ b_sign) {
+            mp_neg(q->tab, q->tab, q->len);
+        }
+        q = js_bigint_normalize(ctx, q);
+        return q;
+    }
+}
+
+/* and, or, xor */
+static JSBigInt *js_bigint_logic(JSContext *ctx, const JSBigInt *a,
+                                 const JSBigInt *b, OPCodeEnum op)
+{
+    JSBigInt *r;
+    js_limb_t b_sign;
+    int a_len, b_len, i;
+
+    if (a->len < b->len) {
+        const JSBigInt *tmp;
+        tmp = a;
+        a = b;
+        b = tmp;
+    }
+    /* a_len >= b_len */
+    a_len = a->len;
+    b_len = b->len;
+    b_sign = -js_bigint_sign(b);
+
+    r = js_bigint_new(ctx, a_len);
+    if (!r)
+        return NULL;
+    switch(op) {
+    case OP_or:
+        for(i = 0; i < b_len; i++) {
+            r->tab[i] = a->tab[i] | b->tab[i];
+        }
+        for(i = b_len; i < a_len; i++) {
+            r->tab[i] = a->tab[i] | b_sign;
+        }
+        break;
+    case OP_and:
+        for(i = 0; i < b_len; i++) {
+            r->tab[i] = a->tab[i] & b->tab[i];
+        }
+        for(i = b_len; i < a_len; i++) {
+            r->tab[i] = a->tab[i] & b_sign;
+        }
+        break;
+    case OP_xor:
+        for(i = 0; i < b_len; i++) {
+            r->tab[i] = a->tab[i] ^ b->tab[i];
+        }
+        for(i = b_len; i < a_len; i++) {
+            r->tab[i] = a->tab[i] ^ b_sign;
+        }
+        break;
+    default:
+        abort();
+    }
+    return js_bigint_normalize(ctx, r);
+}
+
+static JSBigInt *js_bigint_not(JSContext *ctx, const JSBigInt *a)
+{
+    JSBigInt *r;
+    int i;
+
+    r = js_bigint_new(ctx, a->len);
+    if (!r)
+        return NULL;
+    for(i = 0; i < a->len; i++) {
+        r->tab[i] = ~a->tab[i];
+    }
+    /* no normalization is needed */
+    return r;
+}
+
+static JSBigInt *js_bigint_shl(JSContext *ctx, const JSBigInt *a,
+                               unsigned int shift1)
+{
+    int d, i, shift;
+    JSBigInt *r;
+    js_limb_t l;
+
+    if (a->len == 1 && a->tab[0] == 0)
+        return js_bigint_new_si(ctx, 0); /* zero case */
+    d = shift1 / JS_LIMB_BITS;
+    shift = shift1 % JS_LIMB_BITS;
+    r = js_bigint_new(ctx, a->len + d);
+    if (!r)
+        return NULL;
+    for(i = 0; i < d; i++)
+        r->tab[i] = 0;
+    if (shift == 0) {
+        for(i = 0; i < a->len; i++) {
+            r->tab[i + d] = a->tab[i];
+        }
+    } else {
+        l = mp_shl(r->tab + d, a->tab, a->len, shift);
+        if (js_bigint_sign(a))
+            l |= (js_limb_t)(-1) << shift;
+        r = js_bigint_extend(ctx, r, l);
+    }
+    return r;
+}
+
+static JSBigInt *js_bigint_shr(JSContext *ctx, const JSBigInt *a,
+                               unsigned int shift1)
+{
+    int d, i, shift, a_sign, n1;
+    JSBigInt *r;
+
+    d = shift1 / JS_LIMB_BITS;
+    shift = shift1 % JS_LIMB_BITS;
+    a_sign = js_bigint_sign(a);
+    if (d >= a->len)
+        return js_bigint_new_si(ctx, -a_sign);
+    n1 = a->len - d;
+    r = js_bigint_new(ctx, n1);
+    if (!r)
+        return NULL;
+    if (shift == 0) {
+        for(i = 0; i < n1; i++) {
+            r->tab[i] = a->tab[i + d];
+        }
+        /* no normalization is needed */
+    } else {
+        mp_shr(r->tab, a->tab + d, n1, shift, -a_sign);
+        r = js_bigint_normalize(ctx, r);
+    }
+    return r;
+}
+
+static JSBigInt *js_bigint_pow(JSContext *ctx, const JSBigInt *a, JSBigInt *b)
+{
+    uint32_t e;
+    int n_bits, i;
+    JSBigInt *r, *r1;
+
+    /* b must be >= 0 */
+    if (js_bigint_sign(b)) {
+        JS_ThrowRangeError(ctx, "BigInt negative exponent");
+        return NULL;
+    }
+    if (b->len == 1 && b->tab[0] == 0) {
+        /* a^0 = 1 */
+        return js_bigint_new_si(ctx, 1);
+    } else if (a->len == 1) {
+        js_limb_t v;
+        bool is_neg;
+
+        v = a->tab[0];
+        if (v <= 1)
+            return js_bigint_new_si(ctx, v);
+        else if (v == -1)
+            return js_bigint_new_si(ctx, 1 - 2 * (b->tab[0] & 1));
+        is_neg = (js_slimb_t)v < 0;
+        if (is_neg)
+            v = -v;
+        if ((v & (v - 1)) == 0) {
+            uint64_t e1;
+            int n;
+            /* v = 2^n */
+            n = JS_LIMB_BITS - 1 - js_limb_clz(v);
+            if (b->len > 1)
+                goto overflow;
+            if (b->tab[0] > INT32_MAX)
+                goto overflow;
+            e = b->tab[0];
+            e1 = (uint64_t)e * n;
+            if (e1 > JS_BIGINT_MAX_SIZE * JS_LIMB_BITS)
+                goto overflow;
+            e = e1;
+            if (is_neg)
+                is_neg = b->tab[0] & 1;
+            r = js_bigint_new(ctx,
+                              (e + JS_LIMB_BITS + 1 - is_neg) / JS_LIMB_BITS);
+            if (!r)
+                return NULL;
+            memset(r->tab, 0, sizeof(r->tab[0]) * r->len);
+            r->tab[e / JS_LIMB_BITS] =
+                (js_limb_t)(1 - 2 * is_neg) << (e % JS_LIMB_BITS);
+            return r;
+        }
+    }
+    if (b->len > 1)
+        goto overflow;
+    if (b->tab[0] > INT32_MAX)
+        goto overflow;
+    e = b->tab[0];
+    n_bits = 32 - clz32(e);
+
+    r = js_bigint_new(ctx, a->len);
+    if (!r)
+        return NULL;
+    memcpy(r->tab, a->tab, a->len * sizeof(a->tab[0]));
+    for(i = n_bits - 2; i >= 0; i--) {
+        r1 = js_bigint_mul(ctx, r, r);
+        if (!r1)
+            return NULL;
+        js_free(ctx, r);
+        r = r1;
+        if ((e >> i) & 1) {
+            r1 = js_bigint_mul(ctx, r, a);
+            if (!r1)
+                return NULL;
+            js_free(ctx, r);
+            r = r1;
+        }
+    }
+    return r;
+ overflow:
+    JS_ThrowRangeError(ctx, "BigInt is too large");
+    return NULL;
+}
+
+/* return (mant, exp) so that abs(a) ~ mant*2^(exp - (limb_bits -
+   1)). a must be != 0. */
+static uint64_t js_bigint_get_mant_exp(JSContext *ctx,
+                                       int *pexp, const JSBigInt *a)
+{
+    js_limb_t t[4 - JS_LIMB_BITS / 32], carry, v, low_bits;
+    int n1, n2, sgn, shift, i, j, e;
+    uint64_t a1, a0;
+
+    n2 = 4 - JS_LIMB_BITS / 32;
+    n1 = a->len - n2;
+    sgn = js_bigint_sign(a);
+
+    /* low_bits != 0 if there is a non-zero low bit in abs(a) */
+    low_bits = 0;
+    carry = sgn;
+    for(i = 0; i < n1; i++) {
+        v = (a->tab[i] ^ (-sgn)) + carry;
+        carry = v < carry;
+        low_bits |= v;
+    }
+    /* get the n2 high limbs of abs(a) */
+    for(j = 0; j < n2; j++) {
+        i = j + n1;
+        if (i < 0) {
+            v = 0;
+        } else {
+            v = (a->tab[i] ^ (-sgn)) + carry;
+            carry = v < carry;
+        }
+        t[j] = v;
+    }
+
+    a1 = ((uint64_t)t[2] << 32) | t[1];
+    a0 = (uint64_t)t[0] << 32;
+    a0 |= (low_bits != 0);
+    /* normalize */
+    {
+        shift = clz64(a1);
+        if (shift != 0) {
+            a1 = (a1 << shift) | (a0 >> (64 - shift));
+            a0 <<= shift;
+        }
+    }
+    a1 |= (a0 != 0); /* keep the bits for the final rounding */
+    /* compute the exponent */
+    e = a->len * JS_LIMB_BITS - shift - 1;
+    *pexp = e;
+    return a1;
+}
+
+/* shift right with round to nearest, ties to even. n >= 1 */
+static uint64_t shr_rndn(uint64_t a, int n)
+{
+    uint64_t addend = ((a >> n) & 1) + ((1 << (n - 1)) - 1);
+    return (a + addend) >> n;
+}
+
+/* convert to float64 with round to nearest, ties to even. Return
+   +/-infinity if too large. */
+static double js_bigint_to_float64(JSContext *ctx, const JSBigInt *a)
+{
+    int sgn, e;
+    uint64_t mant;
+
+    if (a->len == 1) {
+        /* fast case, including zero */
+        return (double)(js_slimb_t)a->tab[0];
+    }
+
+    sgn = js_bigint_sign(a);
+    mant = js_bigint_get_mant_exp(ctx, &e, a);
+    if (e > 1023) {
+        /* overflow: return infinity */
+        mant = 0;
+        e = 1024;
+    } else {
+        mant = (mant >> 1) | (mant & 1); /* avoid overflow in rounding */
+        mant = shr_rndn(mant, 10);
+        /* rounding can cause an overflow */
+        if (mant >= ((uint64_t)1 << 53)) {
+            mant >>= 1;
+            e++;
+        }
+        mant &= (((uint64_t)1 << 52) - 1);
+    }
+    return uint64_as_float64(((uint64_t)sgn << 63) |
+                             ((uint64_t)(e + 1023) << 52) |
+                             mant);
+}
+
+/* return (1, NULL) if not an integer, (2, NULL) if NaN or Infinity,
+   (0, n) if an integer, (0, NULL) in case of memory error */
+static JSBigInt *js_bigint_from_float64(JSContext *ctx, int *pres, double a1)
+{
+    uint64_t a = float64_as_uint64(a1);
+    int sgn, e, shift;
+    uint64_t mant;
+    JSBigIntBuf buf;
+    JSBigInt *r;
+
+    sgn = a >> 63;
+    e = (a >> 52) & ((1 << 11) - 1);
+    mant = a & (((uint64_t)1 << 52) - 1);
+    if (e == 2047) {
+        /* NaN, Infinity */
+        *pres = 2;
+        return NULL;
+    }
+    if (e == 0 && mant == 0) {
+        /* zero */
+        *pres = 0;
+        return js_bigint_new_si(ctx, 0);
+    }
+    e -= 1023;
+    /* 0 < a < 1 : not an integer */
+    if (e < 0)
+        goto not_an_integer;
+    mant |= (uint64_t)1 << 52;
+    if (e < 52) {
+        shift = 52 - e;
+        /* check that there is no fractional part */
+        if (mant & (((uint64_t)1 << shift) - 1)) {
+        not_an_integer:
+            *pres = 1;
+            return NULL;
+        }
+        mant >>= shift;
+        e = 0;
+    } else {
+        e -= 52;
+    }
+    if (sgn)
+        mant = -mant;
+    /* the integer is mant*2^e */
+    r = js_bigint_set_si64(&buf, (int64_t)mant);
+    *pres = 0;
+    return js_bigint_shl(ctx, r, e);
+}
+
+/* return -1, 0, 1 or (2) (unordered) */
+static int js_bigint_float64_cmp(JSContext *ctx, const JSBigInt *a,
+                                 double b)
+{
+    int b_sign, a_sign, e, f;
+    uint64_t mant, b1, a_mant;
+
+    b1 = float64_as_uint64(b);
+    b_sign = b1 >> 63;
+    e = (b1 >> 52) & ((1 << 11) - 1);
+    mant = b1 & (((uint64_t)1 << 52) - 1);
+    a_sign = js_bigint_sign(a);
+    if (e == 2047) {
+        if (mant != 0) {
+            /* NaN */
+            return 2;
+        } else {
+            /* +/- infinity */
+            return 2 * b_sign - 1;
+        }
+    } else if (e == 0 && mant == 0) {
+        /* b = +/-0 */
+        if (a->len == 1 && a->tab[0] == 0)
+            return 0;
+        else
+            return 1 - 2 * a_sign;
+    } else if (a->len == 1 && a->tab[0] == 0) {
+        /* a = 0, b != 0 */
+        return 2 * b_sign - 1;
+    } else if (a_sign != b_sign) {
+        return 1 - 2 * a_sign;
+    } else {
+        e -= 1023;
+        /* Note: handling denormals is not necessary because we
+           compare to integers hence f >= 0 */
+        /* compute f so that 2^f <= abs(a) < 2^(f+1) */
+        a_mant = js_bigint_get_mant_exp(ctx, &f, a);
+        if (f != e) {
+            if (f < e)
+                return -1;
+            else
+                return 1;
+        } else {
+            mant = (mant | ((uint64_t)1 << 52)) << 11; /* align to a_mant */
+            if (a_mant < mant)
+                return 2 * a_sign - 1;
+            else if (a_mant > mant)
+                return 1 - 2 * a_sign;
+            else
+                return 0;
+        }
+    }
+}
+
+/* return -1, 0 or 1 */
+static int js_bigint_cmp(JSContext *ctx, const JSBigInt *a,
+                         const JSBigInt *b)
+{
+    int a_sign, b_sign, res, i;
+    a_sign = js_bigint_sign(a);
+    b_sign = js_bigint_sign(b);
+    if (a_sign != b_sign) {
+        res = 1 - 2 * a_sign;
+    } else {
+        /* we assume the numbers are normalized */
+        if (a->len != b->len) {
+            if (a->len < b->len)
+                res = 2 * a_sign - 1;
+            else
+                res = 1 - 2 * a_sign;
+        } else {
+            res = 0;
+            for(i = a->len -1; i >= 0; i--) {
+                if (a->tab[i] != b->tab[i]) {
+                    if (a->tab[i] < b->tab[i])
+                        res = -1;
+                    else
+                        res = 1;
+                    break;
+                }
+            }
+        }
+    }
+    return res;
+}
+
+/* contains 10^i */
+static const js_limb_t js_pow_dec[JS_LIMB_DIGITS + 1] = {
+    1U,
+    10U,
+    100U,
+    1000U,
+    10000U,
+    100000U,
+    1000000U,
+    10000000U,
+    100000000U,
+    1000000000U,
+};
+
+/* syntax: [-]digits in base radix. Return NULL if memory error. radix
+   = 10, 2, 8 or 16. */
+static JSBigInt *js_bigint_from_string(JSContext *ctx,
+                                       const char *str, int radix)
+{
+    const char *p = str;
+    int is_neg, n_digits, n_limbs, len, log2_radix, n_bits, i;
+    JSBigInt *r;
+    js_limb_t v, c, h;
+
+    is_neg = 0;
+    if (*p == '-') {
+        is_neg = 1;
+        p++;
+    }
+    while (*p == '0')
+        p++;
+    n_digits = strlen(p);
+    log2_radix = 32 - clz32(radix - 1); /* ceil(log2(radix)) */
+    /* compute the maximum number of limbs */
+    /* XXX: overflow */
+    if (radix == 10) {
+        n_bits = (n_digits * 27 + 7) / 8; /* >= ceil(n_digits * log2(10)) */
+    } else {
+        n_bits = n_digits * log2_radix;
+    }
+    /* we add one extra bit for the sign */
+    n_limbs = max_int(1, n_bits / JS_LIMB_BITS + 1);
+    r = js_bigint_new(ctx, n_limbs);
+    if (!r)
+        return NULL;
+    if (radix == 10) {
+        int digits_per_limb = JS_LIMB_DIGITS;
+        len = 1;
+        r->tab[0] = 0;
+        for(;;) {
+            /* XXX: slow */
+            v = 0;
+            for(i = 0; i < digits_per_limb; i++) {
+                c = to_digit(*p);
+                if (c >= radix)
+                    break;
+                p++;
+                v = v * 10 + c;
+            }
+            if (i == 0)
+                break;
+            if (len == 1 && r->tab[0] == 0) {
+                r->tab[0] = v;
+            } else {
+                h = mp_mul1(r->tab, r->tab, len, js_pow_dec[i], v);
+                if (h != 0) {
+                    r->tab[len++] = h;
+                }
+            }
+        }
+        /* add one extra limb to have the correct sign */
+        if ((r->tab[len - 1] >> (JS_LIMB_BITS - 1)) != 0)
+            r->tab[len++] = 0;
+        r->len = len;
+    } else {
+        unsigned int bit_pos, shift, pos;
+
+        /* power of two base: no multiplication is needed */
+        r->len = n_limbs;
+        memset(r->tab, 0, sizeof(r->tab[0]) * n_limbs);
+        for(i = 0; i < n_digits; i++) {
+            c = to_digit(p[n_digits - 1 - i]);
+            assert(c < radix);
+            bit_pos = i * log2_radix;
+            shift = bit_pos & (JS_LIMB_BITS - 1);
+            pos = bit_pos / JS_LIMB_BITS;
+            r->tab[pos] |= c << shift;
+            /* if log2_radix does not divide JS_LIMB_BITS, needed an
+               additional op */
+            if (shift + log2_radix > JS_LIMB_BITS) {
+                r->tab[pos + 1] |= c >> (JS_LIMB_BITS - shift);
+            }
+        }
+    }
+    r = js_bigint_normalize(ctx, r);
+    /* XXX: could do it in place */
+    if (is_neg) {
+        JSBigInt *r1;
+        r1 = js_bigint_neg(ctx, r);
+        js_free(ctx, r);
+        r = r1;
+    }
+    return r;
+}
+
+/* 2 <= base <= 36 */
+static char const digits[36] = {
+    '0','1','2','3','4','5','6','7','8','9',
+    'a','b','c','d','e','f','g','h','i','j',
+    'k','l','m','n','o','p','q','r','s','t',
+    'u','v','w','x','y','z'
+};
+
+/* special version going backwards */
+/* XXX: use dtoa.c */
+static char *js_u64toa(char *q, int64_t n, unsigned int base)
+{
+    int digit;
+    if (base == 10) {
+        /* division by known base uses multiplication */
+        do {
+            digit = (uint64_t)n % 10;
+            n = (uint64_t)n / 10;
+            *--q = '0' + digit;
+        } while (n != 0);
+    } else {
+        do {
+            digit = (uint64_t)n % base;
+            n = (uint64_t)n / base;
+            *--q = digits[digit];
+        } while (n != 0);
+    }
+    return q;
+}
+
+/* len >= 1. 2 <= radix <= 36 */
+static char *limb_to_a(char *q, js_limb_t n, unsigned int radix, int len)
+{
+    int digit, i;
+
+    if (radix == 10) {
+        /* specific case with constant divisor */
+        /* XXX: optimize */
+        for(i = 0; i < len; i++) {
+            digit = (js_limb_t)n % 10;
+            n = (js_limb_t)n / 10;
+            *--q = digit + '0';
+        }
+    } else {
+        for(i = 0; i < len; i++) {
+            digit = (js_limb_t)n % radix;
+            n = (js_limb_t)n / radix;
+            *--q = digits[digit];
+        }
+    }
+    return q;
+}
+
+#define JS_RADIX_MAX 36
+
+static const uint8_t digits_per_limb_table[JS_RADIX_MAX - 1] = {
+32,20,16,13,12,11,10,10, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+};
+
+static const js_limb_t radix_base_table[JS_RADIX_MAX - 1] = {
+ 0x00000000, 0xcfd41b91, 0x00000000, 0x48c27395,
+ 0x81bf1000, 0x75db9c97, 0x40000000, 0xcfd41b91,
+ 0x3b9aca00, 0x8c8b6d2b, 0x19a10000, 0x309f1021,
+ 0x57f6c100, 0x98c29b81, 0x00000000, 0x18754571,
+ 0x247dbc80, 0x3547667b, 0x4c4b4000, 0x6b5a6e1d,
+ 0x94ace180, 0xcaf18367, 0x0b640000, 0x0e8d4a51,
+ 0x1269ae40, 0x17179149, 0x1cb91000, 0x23744899,
+ 0x2b73a840, 0x34e63b41, 0x40000000, 0x4cfa3cc1,
+ 0x5c13d840, 0x6d91b519, 0x81bf1000,
+};
+
+static JSValue js_bigint_to_string1(JSContext *ctx, JSValueConst val, int radix)
+{
+    if (JS_VALUE_GET_TAG(val) == JS_TAG_SHORT_BIG_INT) {
+        char buf[66];
+        int len;
+        len = i64toa_radix(buf, JS_VALUE_GET_SHORT_BIG_INT(val), radix);
+        return js_new_string8_len(ctx, buf, len);
+    } else {
+        JSBigInt *r, *tmp = NULL;
+        char *buf, *q, *buf_end;
+        int is_neg, n_bits, log2_radix, n_digits;
+        bool is_binary_radix;
+        JSValue res;
+
+        assert(JS_VALUE_GET_TAG(val) == JS_TAG_BIG_INT);
+        r = JS_VALUE_GET_PTR(val);
+        if (r->len == 1 && r->tab[0] == 0) {
+            /* '0' case */
+            return js_new_string8_len(ctx, "0", 1);
+        }
+        is_binary_radix = ((radix & (radix - 1)) == 0);
+        is_neg = js_bigint_sign(r);
+        if (is_neg) {
+            tmp = js_bigint_neg(ctx, r);
+            if (!tmp)
+                return JS_EXCEPTION;
+            r = tmp;
+        } else if (!is_binary_radix) {
+            /* need to modify 'r' */
+            tmp = js_bigint_new(ctx, r->len);
+            if (!tmp)
+                return JS_EXCEPTION;
+            memcpy(tmp->tab, r->tab, r->len * sizeof(r->tab[0]));
+            r = tmp;
+        }
+        log2_radix = 31 - clz32(radix); /* floor(log2(radix)) */
+        n_bits = r->len * JS_LIMB_BITS - js_limb_clz(r->tab[r->len - 1]);
+        /* n_digits is exact only if radix is a power of
+           two. Otherwise it is >= the exact number of digits */
+        n_digits = (n_bits + log2_radix - 1) / log2_radix;
+        /* XXX: could directly build the JSString */
+        buf = js_malloc(ctx, n_digits + is_neg + 1);
+        if (!buf) {
+            js_free(ctx, tmp);
+            return JS_EXCEPTION;
+        }
+        q = buf + n_digits + is_neg + 1;
+        *--q = '\0';
+        buf_end = q;
+        if (!is_binary_radix) {
+            int len;
+            js_limb_t radix_base, v;
+            radix_base = radix_base_table[radix - 2];
+            len = r->len;
+            for(;;) {
+                /* remove leading zero limbs */
+                while (len > 1 && r->tab[len - 1] == 0)
+                    len--;
+                if (len == 1 && r->tab[0] < radix_base) {
+                    v = r->tab[0];
+                    if (v != 0) {
+                        q = js_u64toa(q, v, radix);
+                    }
+                    break;
+                } else {
+                    v = mp_div1(r->tab, r->tab, len, radix_base, 0);
+                    q = limb_to_a(q, v, radix, digits_per_limb_table[radix - 2]);
+                }
+            }
+        } else {
+            int i, shift;
+            unsigned int bit_pos, pos, c;
+
+            /* radix is a power of two */
+            for(i = 0; i < n_digits; i++) {
+                bit_pos = i * log2_radix;
+                pos = bit_pos / JS_LIMB_BITS;
+                shift = bit_pos % JS_LIMB_BITS;
+                if (likely((shift + log2_radix) <= JS_LIMB_BITS)) {
+                    c = r->tab[pos] >> shift;
+                } else {
+                    c = (r->tab[pos] >> shift) |
+                        (r->tab[pos + 1] << (JS_LIMB_BITS - shift));
+                }
+                c &= (radix - 1);
+                *--q = digits[c];
+            }
+        }
+        if (is_neg)
+            *--q = '-';
+        js_free(ctx, tmp);
+        res = js_new_string8_len(ctx, q, buf_end - q);
+        js_free(ctx, buf);
+        return res;
+    }
+}
+
+/* if possible transform a BigInt to short big and free it, otherwise
+   return a normal bigint */
+static JSValue JS_CompactBigInt(JSContext *ctx, JSBigInt *p)
+{
+    JSValue res;
+    if (p->len == 1) {
+        res = __JS_NewShortBigInt(ctx, (js_slimb_t)p->tab[0]);
+        js_free(ctx, p);
+        return res;
+    } else {
+        return JS_MKPTR(JS_TAG_BIG_INT, p);
+    }
+}
+
 /* XXX: remove */
 static double js_strtod(const char *str, int radix, bool is_float)
 {
@@ -10536,7 +12052,10 @@ static double js_strtod(const char *str, int radix, bool is_float)
             n_max = ((uint64_t)-1 - (radix - 1)) / radix;
         /* XXX: could be more precise */
         int_exp = 0;
-        while ((c = to_digit(*p)) < radix) {
+        while (*p != '\0') {
+            c = to_digit((uint8_t)*p);
+            if (c >= radix)
+                break;
             if (n <= n_max) {
                 n = n * radix + c;
             } else {
@@ -10559,23 +12078,6 @@ static double js_strtod(const char *str, int radix, bool is_float)
     return d;
 }
 
-static JSValue js_string_to_bigint(JSContext *ctx, const char *buf, int radix)
-{
-    bf_t *a;
-    int ret;
-    JSValue val;
-    val = JS_NewBigInt(ctx);
-    if (JS_IsException(val))
-        return val;
-    a = JS_GetBigInt(val);
-    ret = bf_atof(a, buf, NULL, radix, BF_PREC_INF, BF_RNDZ);
-    if (ret & BF_ST_MEM_ERROR) {
-        JS_FreeValue(ctx, val);
-        return JS_ThrowOutOfMemory(ctx);
-    }
-    return JS_CompactBigInt1(ctx, val);
-}
-
 /* `js_atof(ctx, p, len, pp, radix, flags)`
    Convert the string pointed to by `p` to a number value.
    Return an exception in case of memory error.
@@ -10718,8 +12220,15 @@ static JSValue js_atof(JSContext *ctx, const char *p, size_t len,
     }
 
     if (flags & ATOD_WANT_BIG_INT) {
-        if (!is_float)
-            val = js_string_to_bigint(ctx, buf, radix);
+        JSBigInt *r;
+        if (!is_float) {
+            r = js_bigint_from_string(ctx, buf, radix);
+            if (!r) {
+                val = JS_ThrowOutOfMemory(ctx);
+                goto done;
+            }
+            val = JS_CompactBigInt(ctx, r);
+        }
     } else {
         d = js_strtod(buf, radix, is_float);
         val = js_number(d);     /* return int or float64 */
@@ -10755,6 +12264,7 @@ static JSValue JS_ToNumberHintFree(JSContext *ctx, JSValue val,
     tag = JS_VALUE_GET_NORM_TAG(val);
     switch(tag) {
     case JS_TAG_BIG_INT:
+    case JS_TAG_SHORT_BIG_INT:
         if (flag != TON_FLAG_NUMERIC) {
             JS_FreeValue(ctx, val);
             return JS_ThrowTypeError(ctx, "cannot convert BigInt to number");
@@ -10788,6 +12298,7 @@ static JSValue JS_ToNumberHintFree(JSContext *ctx, JSValue val,
             JS_FreeValue(ctx, val);
             if (!str)
                 return JS_EXCEPTION;
+            // TODO(saghul): Sync with bellard/quickjs ?
             flags = ATOD_TRIM_SPACES | ATOD_ACCEPT_EMPTY |
                 ATOD_ACCEPT_FLOAT | ATOD_ACCEPT_INFINITY |
                 ATOD_ACCEPT_HEX_PREFIX | ATOD_ACCEPT_BIN_OCT |
@@ -10829,10 +12340,8 @@ static __exception int __JS_ToFloat64Free(JSContext *ctx, double *pres,
     uint32_t tag;
 
     val = JS_ToNumberFree(ctx, val);
-    if (JS_IsException(val)) {
-        *pres = NAN;
-        return -1;
-    }
+    if (JS_IsException(val))
+        goto fail;
     tag = JS_VALUE_GET_NORM_TAG(val);
     switch(tag) {
     case JS_TAG_INT:
@@ -10841,21 +12350,14 @@ static __exception int __JS_ToFloat64Free(JSContext *ctx, double *pres,
     case JS_TAG_FLOAT64:
         d = JS_VALUE_GET_FLOAT64(val);
         break;
-    case JS_TAG_BIG_INT:
-        {
-            JSBigInt *p = JS_VALUE_GET_PTR(val);
-            /* XXX: there can be a double rounding issue with some
-               primitives (such as JS_ToUint8ClampFree()), but it is
-               not critical to fix it. */
-            bf_get_float64(&p->num, &d, BF_RNDN);
-            JS_FreeValue(ctx, val);
-        }
-        break;
     default:
         abort();
     }
     *pres = d;
     return 0;
+fail:
+    *pres = NAN;
+    return -1;
 }
 
 static inline int JS_ToFloat64Free(JSContext *ctx, double *pres, JSValue val)
@@ -11239,21 +12741,6 @@ static __exception int JS_ToArrayLengthFree(JSContext *ctx, uint32_t *plen,
             len = v;
         }
         break;
-    case JS_TAG_BIG_INT:
-        {
-            JSBigInt *p = JS_VALUE_GET_PTR(val);
-            bf_t a;
-            bool res;
-            bf_get_int32((int32_t *)&len, &p->num, BF_GET_INT_MOD);
-            bf_init(ctx->bf_ctx, &a);
-            bf_set_ui(&a, len);
-            res = bf_cmp_eq(&a, &p->num);
-            bf_delete(&a);
-            JS_FreeValue(ctx, val);
-            if (!res)
-                goto fail;
-        }
-        break;
     default:
         if (JS_TAG_IS_FLOAT64(tag)) {
             double d;
@@ -11359,42 +12846,18 @@ static bool JS_NumberIsNegativeOrMinusZero(JSContext *ctx, JSValueConst val)
             u.d = JS_VALUE_GET_FLOAT64(val);
             return (u.u64 >> 63);
         }
+    case JS_TAG_SHORT_BIG_INT:
+        return (JS_VALUE_GET_SHORT_BIG_INT(val) < 0);
     case JS_TAG_BIG_INT:
         {
             JSBigInt *p = JS_VALUE_GET_PTR(val);
-            /* Note: integer zeros are not necessarily positive */
-            return p->num.sign && !bf_is_zero(&p->num);
+            return js_bigint_sign(p);
         }
     default:
         return false;
     }
 }
 
-static JSValue js_bigint_to_string1(JSContext *ctx, JSValueConst val, int radix)
-{
-    JSValue ret;
-    bf_t a_s, *a;
-    char *str;
-    int saved_sign;
-    size_t len;
-
-    a = JS_ToBigInt(ctx, &a_s, val);
-    if (!a)
-        return JS_EXCEPTION;
-    saved_sign = a->sign;
-    if (a->expn == BF_EXP_ZERO)
-        a->sign = 0;
-    str = bf_ftoa(&len, a, radix, 0, BF_RNDZ | BF_FTOA_FORMAT_FRAC |
-                  BF_FTOA_JS_QUIRKS);
-    a->sign = saved_sign;
-    JS_FreeBigInt(ctx, a, &a_s);
-    if (!str)
-        return JS_ThrowOutOfMemory(ctx);
-    ret = js_new_string8_len(ctx, str, len);
-    bf_free(ctx->bf_ctx, str);
-    return ret;
-}
-
 static JSValue js_bigint_to_string(JSContext *ctx, JSValueConst val)
 {
     return js_bigint_to_string1(ctx, val, 10);
@@ -11867,6 +13330,7 @@ JSValue JS_ToStringInternal(JSContext *ctx, JSValueConst val,
         }
     case JS_TAG_FLOAT64:
         return js_dtoa(ctx, JS_VALUE_GET_FLOAT64(val), 0, JS_DTOA_TOSTRING);
+    case JS_TAG_SHORT_BIG_INT:
     case JS_TAG_BIG_INT:
         return js_bigint_to_string(ctx, val);
     case JS_TAG_UNINITIALIZED:
@@ -12152,14 +13616,26 @@ static __maybe_unused void JS_DumpValue(JSRuntime *rt, JSValueConst val)
     case JS_TAG_FLOAT64:
         printf("%.14g", JS_VALUE_GET_FLOAT64(val));
         break;
+    case JS_TAG_SHORT_BIG_INT:
+        printf("%" PRId64 "n", (int64_t)JS_VALUE_GET_SHORT_BIG_INT(val));
+        break;
     case JS_TAG_BIG_INT:
         {
             JSBigInt *p = JS_VALUE_GET_PTR(val);
-            char *str;
-            str = bf_ftoa(NULL, &p->num, 10, 0,
-                          BF_RNDZ | BF_FTOA_FORMAT_FRAC);
-            printf("%sn", str);
-            bf_realloc(&rt->bf_ctx, str, 0);
+            int sgn, i;
+            /* In order to avoid allocations we just dump the limbs */
+            sgn = js_bigint_sign(p);
+            if (sgn)
+                printf("BigInt.asIntN(%d,", p->len * JS_LIMB_BITS);
+            printf("0x");
+            for(i = p->len - 1; i >= 0; i--) {
+                if (i != p->len - 1)
+                    printf("_");
+                printf("%08x", p->tab[i]);
+            }
+            printf("n");
+            if (sgn)
+                printf(")");
         }
         break;
     case JS_TAG_STRING:
@@ -12242,76 +13718,28 @@ static double js_math_pow(double a, double b)
 
 JSValue JS_NewBigInt64(JSContext *ctx, int64_t v)
 {
-    JSValue val;
-    bf_t *a;
-    val = JS_NewBigInt(ctx);
-    if (JS_IsException(val))
-        return val;
-    a = JS_GetBigInt(val);
-    if (bf_set_si(a, v)) {
-        JS_FreeValue(ctx, val);
-        return JS_ThrowOutOfMemory(ctx);
+    if (v >= JS_SHORT_BIG_INT_MIN && v <= JS_SHORT_BIG_INT_MAX) {
+        return __JS_NewShortBigInt(ctx, v);
+    } else {
+        JSBigInt *p;
+        p = js_bigint_new_si64(ctx, v);
+        if (!p)
+            return JS_EXCEPTION;
+        return JS_MKPTR(JS_TAG_BIG_INT, p);
     }
-    return val;
 }
 
 JSValue JS_NewBigUint64(JSContext *ctx, uint64_t v)
 {
-    JSValue val;
-    bf_t *a;
-    val = JS_NewBigInt(ctx);
-    if (JS_IsException(val))
-        return val;
-    a = JS_GetBigInt(val);
-    if (bf_set_ui(a, v)) {
-        JS_FreeValue(ctx, val);
-        return JS_ThrowOutOfMemory(ctx);
-    }
-
-    return val;
-}
-
-/* if the returned bigint is allocated it is equal to
-   'buf'. Otherwise it is a pointer to the bigint in 'val'. Return
-   NULL in case of error. */
-// TODO(bnoordhuis) Merge with JS_ToBigInt()
-static bf_t *JS_ToBigInt1(JSContext *ctx, bf_t *buf, JSValueConst val)
-{
-    uint32_t tag;
-    bf_t *r;
-    JSBigInt *p;
-
-    tag = JS_VALUE_GET_NORM_TAG(val);
-    switch(tag) {
-    case JS_TAG_INT:
-    case JS_TAG_BOOL:
-    case JS_TAG_NULL:
-        r = buf;
-        bf_init(ctx->bf_ctx, r);
-        if (bf_set_si(r, JS_VALUE_GET_INT(val)))
-            goto fail;
-        break;
-    case JS_TAG_FLOAT64:
-        r = buf;
-        bf_init(ctx->bf_ctx, r);
-        if (bf_set_float64(r, JS_VALUE_GET_FLOAT64(val))) {
-        fail:
-            bf_delete(r);
-            return NULL;
-        }
-        break;
-    case JS_TAG_BIG_INT:
-        p = JS_VALUE_GET_PTR(val);
-        r = &p->num;
-        break;
-    case JS_TAG_UNDEFINED:
-    default:
-        r = buf;
-        bf_init(ctx->bf_ctx, r);
-        bf_set_nan(r);
-        break;
+    if (v <= JS_SHORT_BIG_INT_MAX) {
+        return __JS_NewShortBigInt(ctx, v);
+    } else {
+        JSBigInt *p;
+        p = js_bigint_new_ui64(ctx, v);
+        if (!p)
+            return JS_EXCEPTION;
+        return JS_MKPTR(JS_TAG_BIG_INT, p);
     }
-    return r;
 }
 
 /* return NaN if bad bigint literal */
@@ -12325,6 +13753,7 @@ static JSValue JS_StringToBigInt(JSContext *ctx, JSValue val)
     JS_FreeValue(ctx, val);
     if (!str)
         return JS_EXCEPTION;
+    // TODO(saghul): sync with bellard/quickjs ?
     flags = ATOD_WANT_BIG_INT |
         ATOD_TRIM_SPACES | ATOD_ACCEPT_EMPTY |
         ATOD_ACCEPT_HEX_PREFIX | ATOD_ACCEPT_BIN_OCT |
@@ -12342,106 +13771,69 @@ static JSValue JS_StringToBigIntErr(JSContext *ctx, JSValue val)
     return val;
 }
 
-/* if the returned bigint is allocated it is equal to
-   'buf'. Otherwise it is a pointer to the bigint in 'val'. */
-static bf_t *JS_ToBigIntFree(JSContext *ctx, bf_t *buf, JSValue val)
+/* JS Numbers are not allowed */
+static JSValue JS_ToBigIntFree(JSContext *ctx, JSValue val)
 {
     uint32_t tag;
-    bf_t *r;
-    JSBigInt *p;
 
  redo:
     tag = JS_VALUE_GET_NORM_TAG(val);
     switch(tag) {
+    case JS_TAG_SHORT_BIG_INT:
+    case JS_TAG_BIG_INT:
+        break;
     case JS_TAG_INT:
     case JS_TAG_NULL:
     case JS_TAG_UNDEFINED:
     case JS_TAG_FLOAT64:
         goto fail;
     case JS_TAG_BOOL:
-        r = buf;
-        bf_init(ctx->bf_ctx, r);
-        bf_set_si(r, JS_VALUE_GET_INT(val));
-        break;
-    case JS_TAG_BIG_INT:
-        p = JS_VALUE_GET_PTR(val);
-        r = &p->num;
+        val = __JS_NewShortBigInt(ctx, JS_VALUE_GET_INT(val));
         break;
     case JS_TAG_STRING:
         val = JS_StringToBigIntErr(ctx, val);
         if (JS_IsException(val))
-            return NULL;
+            return val;
         goto redo;
     case JS_TAG_OBJECT:
         val = JS_ToPrimitiveFree(ctx, val, HINT_NUMBER);
         if (JS_IsException(val))
-            return NULL;
+            return val;
         goto redo;
     default:
     fail:
         JS_FreeValue(ctx, val);
-        JS_ThrowTypeError(ctx, "cannot convert to BigInt");
-        return NULL;
-    }
-    return r;
-}
-
-static bf_t *JS_ToBigInt(JSContext *ctx, bf_t *buf, JSValueConst val)
-{
-    return JS_ToBigIntFree(ctx, buf, js_dup(val));
-}
-
-static __maybe_unused JSValue JS_ToBigIntValueFree(JSContext *ctx, JSValue val)
-{
-    if (JS_VALUE_GET_TAG(val) == JS_TAG_BIG_INT) {
-        return val;
-    } else {
-        bf_t a_s, *a, *r;
-        int ret;
-        JSValue res;
-
-        res = JS_NewBigInt(ctx);
-        if (JS_IsException(res))
-            return JS_EXCEPTION;
-        a = JS_ToBigIntFree(ctx, &a_s, val);
-        if (!a) {
-            JS_FreeValue(ctx, res);
-            return JS_EXCEPTION;
-        }
-        r = JS_GetBigInt(res);
-        ret = bf_set(r, a);
-        JS_FreeBigInt(ctx, a, &a_s);
-        if (ret) {
-            JS_FreeValue(ctx, res);
-            return JS_ThrowOutOfMemory(ctx);
-        }
-        return JS_CompactBigInt(ctx, res);
+        return JS_ThrowTypeError(ctx, "cannot convert to bigint");
     }
+    return val;
 }
 
-/* free the bf_t allocated by JS_ToBigInt */
-static void JS_FreeBigInt(JSContext *ctx, bf_t *a, bf_t *buf)
+static JSValue JS_ToBigInt(JSContext *ctx, JSValueConst val)
 {
-    if (a == buf) {
-        bf_delete(a);
-    } else {
-        JSBigInt *p = (JSBigInt *)((uint8_t *)a - offsetof(JSBigInt, num));
-        JS_FreeValue(ctx, JS_MKPTR(JS_TAG_BIG_INT, p));
-    }
+    return JS_ToBigIntFree(ctx, js_dup(val));
 }
 
 /* XXX: merge with JS_ToInt64Free with a specific flag */
 static int JS_ToBigInt64Free(JSContext *ctx, int64_t *pres, JSValue val)
 {
-    bf_t a_s, *a;
+    uint64_t res;
 
-    a = JS_ToBigIntFree(ctx, &a_s, val);
-    if (!a) {
+    val = JS_ToBigIntFree(ctx, val);
+    if (JS_IsException(val)) {
         *pres = 0;
         return -1;
     }
-    bf_get_int64(pres, a, BF_GET_INT_MOD);
-    JS_FreeBigInt(ctx, a, &a_s);
+    if (JS_VALUE_GET_TAG(val) == JS_TAG_SHORT_BIG_INT) {
+        res = JS_VALUE_GET_SHORT_BIG_INT(val);
+    } else {
+        JSBigInt *p = JS_VALUE_GET_PTR(val);
+        /* return the value mod 2^64 */
+        res = p->tab[0];
+        if (p->len >= 2)
+            res |= (uint64_t)p->tab[1] << 32;
+        JS_FreeValue(ctx, val);
+    }
+    *pres = res;
     return 0;
 }
 
@@ -12455,103 +13847,6 @@ int JS_ToBigUint64(JSContext *ctx, uint64_t *pres, JSValueConst val)
     return JS_ToBigInt64Free(ctx, (int64_t *)pres, js_dup(val));
 }
 
-static JSValue JS_NewBigInt(JSContext *ctx)
-{
-    JSBigInt *p;
-    p = js_malloc(ctx, sizeof(*p));
-    if (!p)
-        return JS_EXCEPTION;
-    p->header.ref_count = 1;
-    bf_init(ctx->bf_ctx, &p->num);
-    return JS_MKPTR(JS_TAG_BIG_INT, p);
-}
-
-static JSValue JS_CompactBigInt1(JSContext *ctx, JSValue val)
-{
-    if (JS_VALUE_GET_TAG(val) != JS_TAG_BIG_INT)
-        return val; /* fail safe */
-    bf_t *a = JS_GetBigInt(val);
-    if (a->expn == BF_EXP_ZERO && a->sign) {
-        assert(((JSBigInt*)JS_VALUE_GET_PTR(val))->header.ref_count == 1);
-        a->sign = 0;
-    }
-    return val;
-}
-
-/* Nnormalize the zero representation. Could also be used to convert the bigint
-   to a short bigint value. The reference count of the value must be
-   1. Cannot fail */
-static JSValue JS_CompactBigInt(JSContext *ctx, JSValue val)
-{
-    return JS_CompactBigInt1(ctx, val);
-}
-
-static JSValue throw_bf_exception(JSContext *ctx, int status)
-{
-    const char *str;
-    if (status & BF_ST_MEM_ERROR)
-        return JS_ThrowOutOfMemory(ctx);
-    if (status & BF_ST_DIVIDE_ZERO) {
-        str = "division by zero";
-    } else if (status & BF_ST_INVALID_OP) {
-        str = "invalid operation";
-    } else {
-        str = "integer overflow";
-    }
-    return JS_ThrowRangeError(ctx, "%s", str);
-}
-
-static int js_unary_arith_bigint(JSContext *ctx,
-                                 JSValue *pres, OPCodeEnum op, JSValue op1)
-{
-    bf_t a_s, *r, *a;
-    int ret, v;
-    JSValue res;
-
-    if (op == OP_plus) {
-        JS_ThrowTypeError(ctx, "BigInt argument with unary +");
-        JS_FreeValue(ctx, op1);
-        return -1;
-    }
-    res = JS_NewBigInt(ctx);
-    if (JS_IsException(res)) {
-        JS_FreeValue(ctx, op1);
-        return -1;
-    }
-    r = JS_GetBigInt(res);
-    a = JS_ToBigIntFree(ctx, &a_s, op1); // infallible, always a bigint
-    ret = 0;
-    switch(op) {
-    case OP_inc:
-    case OP_dec:
-        v = 2 * (op - OP_dec) - 1;
-        ret = bf_add_si(r, a, v, BF_PREC_INF, BF_RNDZ);
-        break;
-    case OP_plus:
-        ret = bf_set(r, a);
-        break;
-    case OP_neg:
-        ret = bf_set(r, a);
-        bf_neg(r);
-        break;
-    case OP_not:
-        ret = bf_add_si(r, a, 1, BF_PREC_INF, BF_RNDZ);
-        bf_neg(r);
-        break;
-    default:
-        abort();
-    }
-    JS_FreeBigInt(ctx, a, &a_s);
-    if (unlikely(ret)) {
-        JS_FreeValue(ctx, res);
-        throw_bf_exception(ctx, ret);
-        return -1;
-    }
-    res = JS_CompactBigInt(ctx, res);
-    *pres = res;
-    return 0;
-}
-
 static no_inline __exception int js_unary_arith_slow(JSContext *ctx,
                                                      JSValue *sp,
                                                      OPCodeEnum op)
@@ -12559,12 +13854,13 @@ static no_inline __exception int js_unary_arith_slow(JSContext *ctx,
     JSValue op1;
     int v;
     uint32_t tag;
+    JSBigIntBuf buf1;
+    JSBigInt *p1;
 
     op1 = sp[-1];
     /* fast path for float64 */
     if (JS_TAG_IS_FLOAT64(JS_VALUE_GET_TAG(op1)))
         goto handle_float64;
-
     op1 = JS_ToNumericFree(ctx, op1);
     if (JS_IsException(op1))
         goto exception;
@@ -12593,17 +13889,80 @@ static no_inline __exception int js_unary_arith_slow(JSContext *ctx,
             default:
                 abort();
             }
-            sp[-1] = js_int64(v64);
+            sp[-1] = JS_NewInt64(ctx, v64);
+        }
+        break;
+    case JS_TAG_SHORT_BIG_INT:
+        {
+            int64_t v;
+            v = JS_VALUE_GET_SHORT_BIG_INT(op1);
+            switch(op) {
+            case OP_plus:
+                JS_ThrowTypeError(ctx, "bigint argument with unary +");
+                goto exception;
+            case OP_inc:
+                if (v == JS_SHORT_BIG_INT_MAX)
+                    goto bigint_slow_case;
+                sp[-1] = __JS_NewShortBigInt(ctx, v + 1);
+                break;
+            case OP_dec:
+                if (v == JS_SHORT_BIG_INT_MIN)
+                    goto bigint_slow_case;
+                sp[-1] = __JS_NewShortBigInt(ctx, v - 1);
+                break;
+            case OP_neg:
+                v = JS_VALUE_GET_SHORT_BIG_INT(op1);
+                if (v == JS_SHORT_BIG_INT_MIN) {
+                bigint_slow_case:
+                    p1 = js_bigint_set_short(&buf1, op1);
+                    goto bigint_slow_case1;
+                }
+                sp[-1] = __JS_NewShortBigInt(ctx, -v);
+                break;
+            default:
+                abort();
+            }
         }
         break;
     case JS_TAG_BIG_INT:
-        if (js_unary_arith_bigint(ctx, sp - 1, op, op1))
-            goto exception;
+        {
+            JSBigInt *r;
+            p1 = JS_VALUE_GET_PTR(op1);
+        bigint_slow_case1:
+            switch(op) {
+            case OP_plus:
+                JS_ThrowTypeError(ctx, "bigint argument with unary +");
+                JS_FreeValue(ctx, op1);
+                goto exception;
+            case OP_inc:
+            case OP_dec:
+                {
+                    JSBigIntBuf buf2;
+                    JSBigInt *p2;
+                    p2 = js_bigint_set_si(&buf2, 2 * (op - OP_dec) - 1);
+                    r = js_bigint_add(ctx, p1, p2, 0);
+                }
+                break;
+            case OP_neg:
+                r = js_bigint_neg(ctx, p1);
+                break;
+            case OP_not:
+                r = js_bigint_not(ctx, p1);
+                break;
+            default:
+                abort();
+            }
+            JS_FreeValue(ctx, op1);
+            if (!r)
+                goto exception;
+            sp[-1] = JS_CompactBigInt(ctx, r);
+        }
         break;
     default:
     handle_float64:
         {
-            double d = JS_VALUE_GET_FLOAT64(op1);
+            double d;
+            d = JS_VALUE_GET_FLOAT64(op1);
             switch(op) {
             case OP_inc:
             case OP_dec:
@@ -12649,17 +14008,24 @@ static no_inline int js_not_slow(JSContext *ctx, JSValue *sp)
 {
     JSValue op1;
 
-    op1 = JS_ToNumericFree(ctx, sp[-1]);
+    op1 = sp[-1];
+    op1 = JS_ToNumericFree(ctx, op1);
     if (JS_IsException(op1))
         goto exception;
-    if (JS_VALUE_GET_TAG(op1) == JS_TAG_BIG_INT) {
-        if (js_unary_arith_bigint(ctx, sp - 1, OP_not, op1))
+    if (JS_VALUE_GET_TAG(op1) == JS_TAG_SHORT_BIG_INT) {
+        sp[-1] = __JS_NewShortBigInt(ctx, ~JS_VALUE_GET_SHORT_BIG_INT(op1));
+    } else if (JS_VALUE_GET_TAG(op1) == JS_TAG_BIG_INT) {
+        JSBigInt *r;
+        r = js_bigint_not(ctx, JS_VALUE_GET_PTR(op1));
+        JS_FreeValue(ctx, op1);
+        if (!r)
             goto exception;
+        sp[-1] = JS_CompactBigInt(ctx, r);
     } else {
         int32_t v1;
         if (unlikely(JS_ToInt32Free(ctx, &v1, op1)))
             goto exception;
-        sp[-1] = js_int32(~v1);
+        sp[-1] = JS_NewInt32(ctx, ~v1);
     }
     return 0;
  exception:
@@ -12667,107 +14033,6 @@ static no_inline int js_not_slow(JSContext *ctx, JSValue *sp)
     return -1;
 }
 
-static int js_binary_arith_bigint(JSContext *ctx, OPCodeEnum op,
-                                  JSValue *pres, JSValue op1, JSValue op2)
-{
-    bf_t a_s, b_s, *r, *a, *b;
-    int ret;
-    JSValue res;
-
-    a = JS_ToBigIntFree(ctx, &a_s, op1);
-    if (!a) {
-        JS_FreeValue(ctx, op2);
-        return -1;
-    }
-    b = JS_ToBigIntFree(ctx, &b_s, op2);
-    if (!b) {
-        JS_FreeBigInt(ctx, a, &a_s);
-        return -1;
-    }
-    res = JS_NewBigInt(ctx);
-    if (JS_IsException(res)) {
-        JS_FreeBigInt(ctx, a, &a_s);
-        JS_FreeBigInt(ctx, b, &b_s);
-        return -1;
-    }
-    r = JS_GetBigInt(res);
-    ret = 0;
-    switch(op) {
-    case OP_add:
-        ret = bf_add(r, a, b, BF_PREC_INF, BF_RNDZ);
-        break;
-    case OP_sub:
-        ret = bf_sub(r, a, b, BF_PREC_INF, BF_RNDZ);
-        break;
-    case OP_mul:
-        ret = bf_mul(r, a, b, BF_PREC_INF, BF_RNDZ);
-        break;
-    case OP_div:
-        {
-            bf_t rem_s, *rem = &rem_s;
-            bf_init(ctx->bf_ctx, rem);
-            ret = bf_divrem(r, rem, a, b, BF_PREC_INF, BF_RNDZ, BF_RNDZ);
-            bf_delete(rem);
-        }
-        break;
-    case OP_mod:
-        ret = bf_rem(r, a, b, BF_PREC_INF, BF_RNDZ,
-                     BF_RNDZ) & BF_ST_INVALID_OP;
-        break;
-    case OP_pow:
-        if (b->sign) {
-            ret = BF_ST_INVALID_OP;
-        } else {
-            ret = bf_pow(r, a, b, BF_PREC_INF, BF_RNDZ | BF_POW_JS_QUIRKS);
-        }
-        break;
-
-        /* logical operations */
-    case OP_shl:
-    case OP_sar:
-        {
-            slimb_t v2;
-#if LIMB_BITS == 32
-            bf_get_int32(&v2, b, 0);
-            if (v2 == INT32_MIN)
-                v2 = INT32_MIN + 1;
-#else
-            bf_get_int64(&v2, b, 0);
-            if (v2 == INT64_MIN)
-                v2 = INT64_MIN + 1;
-#endif
-            if (op == OP_sar)
-                v2 = -v2;
-            ret = bf_set(r, a);
-            ret |= bf_mul_2exp(r, v2, BF_PREC_INF, BF_RNDZ);
-            if (v2 < 0) {
-                ret |= bf_rint(r, BF_RNDD) & (BF_ST_OVERFLOW | BF_ST_MEM_ERROR);
-            }
-        }
-        break;
-    case OP_and:
-        ret = bf_logic_and(r, a, b);
-        break;
-    case OP_or:
-        ret = bf_logic_or(r, a, b);
-        break;
-    case OP_xor:
-        ret = bf_logic_xor(r, a, b);
-        break;
-    default:
-        abort();
-    }
-    JS_FreeBigInt(ctx, a, &a_s);
-    JS_FreeBigInt(ctx, b, &b_s);
-    if (unlikely(ret)) {
-        JS_FreeValue(ctx, res);
-        throw_bf_exception(ctx, ret);
-        return -1;
-    }
-    *pres = JS_CompactBigInt(ctx, res);
-    return 0;
-}
-
 static no_inline __exception int js_binary_arith_slow(JSContext *ctx, JSValue *sp,
                                                       OPCodeEnum op)
 {
@@ -12785,7 +14050,50 @@ static no_inline __exception int js_binary_arith_slow(JSContext *ctx, JSValue *s
         d2 = JS_VALUE_GET_FLOAT64(op2);
         goto handle_float64;
     }
-
+    /* fast path for short big int operations */
+    if (tag1 == JS_TAG_SHORT_BIG_INT && tag2 == JS_TAG_SHORT_BIG_INT) {
+        js_slimb_t v1, v2;
+        js_sdlimb_t v;
+        v1 = JS_VALUE_GET_SHORT_BIG_INT(op1);
+        v2 = JS_VALUE_GET_SHORT_BIG_INT(op2);
+        switch(op) {
+        case OP_sub:
+            v = (js_sdlimb_t)v1 - (js_sdlimb_t)v2;
+            break;
+        case OP_mul:
+            v = (js_sdlimb_t)v1 * (js_sdlimb_t)v2;
+            break;
+        case OP_div:
+            if (v2 == 0 ||
+                ((js_limb_t)v1 == (js_limb_t)1 << (JS_LIMB_BITS - 1) &&
+                 v2 == -1)) {
+                goto slow_big_int;
+            }
+            sp[-2] = __JS_NewShortBigInt(ctx, v1 / v2);
+            return 0;
+        case OP_mod:
+            if (v2 == 0 ||
+                ((js_limb_t)v1 == (js_limb_t)1 << (JS_LIMB_BITS - 1) &&
+                 v2 == -1)) {
+                goto slow_big_int;
+            }
+            sp[-2] = __JS_NewShortBigInt(ctx, v1 % v2);
+            return 0;
+        case OP_pow:
+            goto slow_big_int;
+        default:
+            abort();
+        }
+        if (likely(v >= JS_SHORT_BIG_INT_MIN && v <= JS_SHORT_BIG_INT_MAX)) {
+            sp[-2] = __JS_NewShortBigInt(ctx, v);
+        } else {
+            JSBigInt *r = js_bigint_new_di(ctx, v);
+            if (!r)
+                goto exception;
+            sp[-2] = JS_MKPTR(JS_TAG_BIG_INT, r);
+        }
+        return 0;
+    }
     op1 = JS_ToNumericFree(ctx, op1);
     if (JS_IsException(op1)) {
         JS_FreeValue(ctx, op2);
@@ -12816,7 +14124,7 @@ static no_inline __exception int js_binary_arith_slow(JSContext *ctx, JSValue *s
             }
             break;
         case OP_div:
-            sp[-2] = js_float64((double)v1 / (double)v2);
+            sp[-2] = js_number((double)v1 / (double)v2);
             return 0;
         case OP_mod:
             if (v1 < 0 || v2 <= 0) {
@@ -12833,9 +14141,47 @@ static no_inline __exception int js_binary_arith_slow(JSContext *ctx, JSValue *s
             abort();
         }
         sp[-2] = js_int64(v);
-    } else if (tag1 == JS_TAG_BIG_INT || tag2 == JS_TAG_BIG_INT) {
-        if (js_binary_arith_bigint(ctx, op, sp - 2, op1, op2))
+    } else if ((tag1 == JS_TAG_SHORT_BIG_INT || tag1 == JS_TAG_BIG_INT) &&
+               (tag2 == JS_TAG_SHORT_BIG_INT || tag2 == JS_TAG_BIG_INT)) {
+        JSBigInt *p1, *p2, *r;
+        JSBigIntBuf buf1, buf2;
+    slow_big_int:
+        /* bigint result */
+        if (JS_VALUE_GET_TAG(op1) == JS_TAG_SHORT_BIG_INT)
+            p1 = js_bigint_set_short(&buf1, op1);
+        else
+            p1 = JS_VALUE_GET_PTR(op1);
+        if (JS_VALUE_GET_TAG(op2) == JS_TAG_SHORT_BIG_INT)
+            p2 = js_bigint_set_short(&buf2, op2);
+        else
+            p2 = JS_VALUE_GET_PTR(op2);
+        switch(op) {
+        case OP_add:
+            r = js_bigint_add(ctx, p1, p2, 0);
+            break;
+        case OP_sub:
+            r = js_bigint_add(ctx, p1, p2, 1);
+            break;
+        case OP_mul:
+            r = js_bigint_mul(ctx, p1, p2);
+            break;
+        case OP_div:
+            r = js_bigint_divrem(ctx, p1, p2, false);
+            break;
+        case OP_mod:
+            r = js_bigint_divrem(ctx, p1, p2, true);
+            break;
+        case OP_pow:
+            r = js_bigint_pow(ctx, p1, p2);
+            break;
+        default:
+            abort();
+        }
+        JS_FreeValue(ctx, op1);
+        JS_FreeValue(ctx, op2);
+        if (!r)
             goto exception;
+        sp[-2] = JS_CompactBigInt(ctx, r);
     } else {
         double dr;
         /* float64 result */
@@ -12892,6 +14238,23 @@ static no_inline __exception int js_add_slow(JSContext *ctx, JSValue *sp)
         sp[-2] = js_float64(d1 + d2);
         return 0;
     }
+    /* fast path for short bigint */
+    if (tag1 == JS_TAG_SHORT_BIG_INT && tag2 == JS_TAG_SHORT_BIG_INT) {
+        js_slimb_t v1, v2;
+        js_sdlimb_t v;
+        v1 = JS_VALUE_GET_SHORT_BIG_INT(op1);
+        v2 = JS_VALUE_GET_SHORT_BIG_INT(op2);
+        v = (js_sdlimb_t)v1 + (js_sdlimb_t)v2;
+        if (likely(v >= JS_SHORT_BIG_INT_MIN && v <= JS_SHORT_BIG_INT_MAX)) {
+            sp[-2] = __JS_NewShortBigInt(ctx, v);
+        } else {
+            JSBigInt *r = js_bigint_new_di(ctx, v);
+            if (!r)
+                goto exception;
+            sp[-2] = JS_MKPTR(JS_TAG_BIG_INT, r);
+        }
+        return 0;
+    }
 
     if (tag1 == JS_TAG_OBJECT || tag2 == JS_TAG_OBJECT) {
         op1 = JS_ToPrimitiveFree(ctx, op1, HINT_NONE);
@@ -12935,10 +14298,26 @@ static no_inline __exception int js_add_slow(JSContext *ctx, JSValue *sp)
         v1 = JS_VALUE_GET_INT(op1);
         v2 = JS_VALUE_GET_INT(op2);
         v = (int64_t)v1 + (int64_t)v2;
-        sp[-2] = js_int64(v);
-    } else if (tag1 == JS_TAG_BIG_INT || tag2 == JS_TAG_BIG_INT) {
-        if (js_binary_arith_bigint(ctx, OP_add, sp - 2, op1, op2))
+        sp[-2] = JS_NewInt64(ctx, v);
+    } else if ((tag1 == JS_TAG_BIG_INT || tag1 == JS_TAG_SHORT_BIG_INT) &&
+               (tag2 == JS_TAG_BIG_INT || tag2 == JS_TAG_SHORT_BIG_INT)) {
+        JSBigInt *p1, *p2, *r;
+        JSBigIntBuf buf1, buf2;
+        /* bigint result */
+        if (JS_VALUE_GET_TAG(op1) == JS_TAG_SHORT_BIG_INT)
+            p1 = js_bigint_set_short(&buf1, op1);
+        else
+            p1 = JS_VALUE_GET_PTR(op1);
+        if (JS_VALUE_GET_TAG(op2) == JS_TAG_SHORT_BIG_INT)
+            p2 = js_bigint_set_short(&buf2, op2);
+        else
+            p2 = JS_VALUE_GET_PTR(op2);
+        r = js_bigint_add(ctx, p1, p2, 0);
+        JS_FreeValue(ctx, op1);
+        JS_FreeValue(ctx, op2);
+        if (!r)
             goto exception;
+        sp[-2] = JS_CompactBigInt(ctx, r);
     } else {
         double d1, d2;
         /* float64 result */
@@ -12970,6 +14349,62 @@ static no_inline __exception int js_binary_logic_slow(JSContext *ctx,
     tag1 = JS_VALUE_GET_NORM_TAG(op1);
     tag2 = JS_VALUE_GET_NORM_TAG(op2);
 
+    if (tag1 == JS_TAG_SHORT_BIG_INT && tag2 == JS_TAG_SHORT_BIG_INT) {
+        js_slimb_t v1, v2, v;
+        js_sdlimb_t vd;
+        v1 = JS_VALUE_GET_SHORT_BIG_INT(op1);
+        v2 = JS_VALUE_GET_SHORT_BIG_INT(op2);
+        /* bigint fast path */
+        switch(op) {
+        case OP_and:
+            v = v1 & v2;
+            break;
+        case OP_or:
+            v = v1 | v2;
+            break;
+        case OP_xor:
+            v = v1 ^ v2;
+            break;
+        case OP_sar:
+            if (v2 > (JS_LIMB_BITS - 1)) {
+                goto slow_big_int;
+            } else if (v2 < 0) {
+                if (v2 < -(JS_LIMB_BITS - 1))
+                    goto slow_big_int;
+                v2 = -v2;
+                goto bigint_shl;
+            }
+        bigint_sar:
+            v = v1 >> v2;
+            break;
+        case OP_shl:
+            if (v2 > (JS_LIMB_BITS - 1)) {
+                goto slow_big_int;
+            } else if (v2 < 0) {
+                if (v2 < -(JS_LIMB_BITS - 1))
+                    goto slow_big_int;
+                v2 = -v2;
+                goto bigint_sar;
+            }
+        bigint_shl:
+            vd = (js_dlimb_t)v1 << v2;
+            if (likely(vd >= JS_SHORT_BIG_INT_MIN &&
+                       vd <= JS_SHORT_BIG_INT_MAX)) {
+                v = vd;
+            } else {
+                JSBigInt *r = js_bigint_new_di(ctx, vd);
+                if (!r)
+                    goto exception;
+                sp[-2] = JS_MKPTR(JS_TAG_BIG_INT, r);
+                return 0;
+            }
+            break;
+        default:
+            abort();
+        }
+        sp[-2] = __JS_NewShortBigInt(ctx, v);
+        return 0;
+    }
     op1 = JS_ToNumericFree(ctx, op1);
     if (JS_IsException(op1)) {
         JS_FreeValue(ctx, op2);
@@ -12983,15 +14418,50 @@ static no_inline __exception int js_binary_logic_slow(JSContext *ctx,
 
     tag1 = JS_VALUE_GET_TAG(op1);
     tag2 = JS_VALUE_GET_TAG(op2);
-    if (tag1 == JS_TAG_BIG_INT || tag2 == JS_TAG_BIG_INT) {
-        if (tag1 != tag2) {
-            JS_FreeValue(ctx, op1);
-            JS_FreeValue(ctx, op2);
-            JS_ThrowTypeError(ctx, "both operands must be BigInt");
-            goto exception;
-        } else if (js_binary_arith_bigint(ctx, op, sp - 2, op1, op2)) {
-            goto exception;
+    if ((tag1 == JS_TAG_BIG_INT || tag1 == JS_TAG_SHORT_BIG_INT) &&
+        (tag2 == JS_TAG_BIG_INT || tag2 == JS_TAG_SHORT_BIG_INT)) {
+        JSBigInt *p1, *p2, *r;
+        JSBigIntBuf buf1, buf2;
+    slow_big_int:
+        if (JS_VALUE_GET_TAG(op1) == JS_TAG_SHORT_BIG_INT)
+            p1 = js_bigint_set_short(&buf1, op1);
+        else
+            p1 = JS_VALUE_GET_PTR(op1);
+        if (JS_VALUE_GET_TAG(op2) == JS_TAG_SHORT_BIG_INT)
+            p2 = js_bigint_set_short(&buf2, op2);
+        else
+            p2 = JS_VALUE_GET_PTR(op2);
+        switch(op) {
+        case OP_and:
+        case OP_or:
+        case OP_xor:
+            r = js_bigint_logic(ctx, p1, p2, op);
+            break;
+        case OP_shl:
+        case OP_sar:
+            {
+                js_slimb_t shift;
+                shift = js_bigint_get_si_sat(p2);
+                if (shift > INT32_MAX)
+                    shift = INT32_MAX;
+                else if (shift < -INT32_MAX)
+                    shift = -INT32_MAX;
+                if (op == OP_sar)
+                    shift = -shift;
+                if (shift >= 0)
+                    r = js_bigint_shl(ctx, p1, shift);
+                else
+                    r = js_bigint_shr(ctx, p1, -shift);
+            }
+            break;
+        default:
+            abort();
         }
+        JS_FreeValue(ctx, op1);
+        JS_FreeValue(ctx, op2);
+        if (!r)
+            goto exception;
+        sp[-2] = JS_CompactBigInt(ctx, r);
     } else {
         if (unlikely(JS_ToInt32Free(ctx, (int32_t *)&v1, op1))) {
             JS_FreeValue(ctx, op2);
@@ -13027,49 +14497,96 @@ static no_inline __exception int js_binary_logic_slow(JSContext *ctx,
     return -1;
 }
 
+/* op1 must be a bigint or int. */
+static JSBigInt *JS_ToBigIntBuf(JSContext *ctx, JSBigIntBuf *buf1,
+                                JSValue op1)
+{
+    JSBigInt *p1;
+
+    switch(JS_VALUE_GET_TAG(op1)) {
+    case JS_TAG_INT:
+        p1 = js_bigint_set_si(buf1, JS_VALUE_GET_INT(op1));
+        break;
+    case JS_TAG_SHORT_BIG_INT:
+        p1 = js_bigint_set_short(buf1, op1);
+        break;
+    case JS_TAG_BIG_INT:
+        p1 = JS_VALUE_GET_PTR(op1);
+        break;
+    default:
+        abort();
+    }
+    return p1;
+}
+
+/* op1 and op2 must be numeric types and at least one must be a
+   bigint. No exception is generated. */
 static int js_compare_bigint(JSContext *ctx, OPCodeEnum op,
                              JSValue op1, JSValue op2)
 {
-    bf_t a_s, b_s, *a, *b;
-    int res;
+    int res, val, tag1, tag2;
+    JSBigIntBuf buf1, buf2;
+    JSBigInt *p1, *p2;
 
-    a = JS_ToBigInt1(ctx, &a_s, op1);
-    if (!a) {
-        JS_FreeValue(ctx, op2);
-        return -1;
-    }
-    b = JS_ToBigInt1(ctx, &b_s, op2);
-    if (!b) {
-        if (a == &a_s)
-            bf_delete(a);
+    tag1 = JS_VALUE_GET_NORM_TAG(op1);
+    tag2 = JS_VALUE_GET_NORM_TAG(op2);
+    if ((tag1 == JS_TAG_SHORT_BIG_INT || tag1 == JS_TAG_INT) &&
+        (tag2 == JS_TAG_SHORT_BIG_INT || tag2 == JS_TAG_INT)) {
+        /* fast path */
+        js_slimb_t v1, v2;
+        if (tag1 == JS_TAG_INT)
+            v1 = JS_VALUE_GET_INT(op1);
+        else
+            v1 = JS_VALUE_GET_SHORT_BIG_INT(op1);
+        if (tag2 == JS_TAG_INT)
+            v2 = JS_VALUE_GET_INT(op2);
+        else
+            v2 = JS_VALUE_GET_SHORT_BIG_INT(op2);
+        val = (v1 > v2) - (v1 < v2);
+    } else {
+        if (tag1 == JS_TAG_FLOAT64) {
+            p2 = JS_ToBigIntBuf(ctx, &buf2, op2);
+            val = js_bigint_float64_cmp(ctx, p2, JS_VALUE_GET_FLOAT64(op1));
+            if (val == 2)
+                goto unordered;
+            val = -val;
+        } else if (tag2 == JS_TAG_FLOAT64) {
+            p1 = JS_ToBigIntBuf(ctx, &buf1, op1);
+            val = js_bigint_float64_cmp(ctx, p1, JS_VALUE_GET_FLOAT64(op2));
+            if (val == 2) {
+            unordered:
+                JS_FreeValue(ctx, op1);
+                JS_FreeValue(ctx, op2);
+                return false;
+            }
+        } else {
+            p1 = JS_ToBigIntBuf(ctx, &buf1, op1);
+            p2 = JS_ToBigIntBuf(ctx, &buf2, op2);
+            val = js_bigint_cmp(ctx, p1, p2);
+        }
         JS_FreeValue(ctx, op1);
-        return -1;
+        JS_FreeValue(ctx, op2);
     }
+
     switch(op) {
     case OP_lt:
-        res = bf_cmp_lt(a, b); /* if NaN return false */
+        res = val < 0;
         break;
     case OP_lte:
-        res = bf_cmp_le(a, b); /* if NaN return false */
+        res = val <= 0;
         break;
     case OP_gt:
-        res = bf_cmp_lt(b, a); /* if NaN return false */
+        res = val > 0;
         break;
     case OP_gte:
-        res = bf_cmp_le(b, a); /* if NaN return false */
+        res = val >= 0;
         break;
     case OP_eq:
-        res = bf_cmp_eq(a, b); /* if NaN return false */
+        res = val == 0;
         break;
     default:
         abort();
     }
-    if (a == &a_s)
-        bf_delete(a);
-    if (b == &b_s)
-        bf_delete(b);
-    JS_FreeValue(ctx, op1);
-    JS_FreeValue(ctx, op2);
     return res;
 }
 
@@ -13125,16 +14642,20 @@ static no_inline int js_relational_slow(JSContext *ctx, JSValue *sp,
         /* fast path for float64/int */
         goto float64_compare;
     } else {
-        if (((tag1 == JS_TAG_BIG_INT && tag2 == JS_TAG_STRING) ||
-             (tag2 == JS_TAG_BIG_INT && tag1 == JS_TAG_STRING))) {
+        if ((((tag1 == JS_TAG_BIG_INT || tag1 == JS_TAG_SHORT_BIG_INT) &&
+              tag2 == JS_TAG_STRING) ||
+             ((tag2 == JS_TAG_BIG_INT || tag2 == JS_TAG_SHORT_BIG_INT) &&
+              tag1 == JS_TAG_STRING))) {
             if (tag1 == JS_TAG_STRING) {
                 op1 = JS_StringToBigInt(ctx, op1);
-                if (JS_VALUE_GET_TAG(op1) != JS_TAG_BIG_INT)
+                if (JS_VALUE_GET_TAG(op1) != JS_TAG_BIG_INT &&
+                    JS_VALUE_GET_TAG(op1) != JS_TAG_SHORT_BIG_INT)
                     goto invalid_bigint_string;
             }
             if (tag2 == JS_TAG_STRING) {
                 op2 = JS_StringToBigInt(ctx, op2);
-                if (JS_VALUE_GET_TAG(op2) != JS_TAG_BIG_INT) {
+                if (JS_VALUE_GET_TAG(op2) != JS_TAG_BIG_INT &&
+                    JS_VALUE_GET_TAG(op2) != JS_TAG_SHORT_BIG_INT) {
                 invalid_bigint_string:
                     JS_FreeValue(ctx, op1);
                     JS_FreeValue(ctx, op2);
@@ -13158,10 +14679,9 @@ static no_inline int js_relational_slow(JSContext *ctx, JSValue *sp,
         tag1 = JS_VALUE_GET_NORM_TAG(op1);
         tag2 = JS_VALUE_GET_NORM_TAG(op2);
 
-        if (tag1 == JS_TAG_BIG_INT || tag2 == JS_TAG_BIG_INT) {
+        if (tag1 == JS_TAG_BIG_INT || tag1 == JS_TAG_SHORT_BIG_INT ||
+            tag2 == JS_TAG_BIG_INT || tag2 == JS_TAG_SHORT_BIG_INT) {
             res = js_compare_bigint(ctx, op, op1, op2);
-            if (res < 0)
-                goto exception;
         } else {
             double d1, d2;
 
@@ -13205,8 +14725,9 @@ static no_inline int js_relational_slow(JSContext *ctx, JSValue *sp,
 
 static bool tag_is_number(uint32_t tag)
 {
-    return (tag == JS_TAG_INT || tag == JS_TAG_BIG_INT ||
-            tag == JS_TAG_FLOAT64);
+    return (tag == JS_TAG_INT ||
+            tag == JS_TAG_FLOAT64 ||
+            tag == JS_TAG_BIG_INT || tag == JS_TAG_SHORT_BIG_INT);
 }
 
 static no_inline __exception int js_eq_slow(JSContext *ctx, JSValue *sp,
@@ -13253,15 +14774,18 @@ static no_inline __exception int js_eq_slow(JSContext *ctx, JSValue *sp,
     } else if ((tag1 == JS_TAG_STRING && tag_is_number(tag2)) ||
                (tag2 == JS_TAG_STRING && tag_is_number(tag1))) {
 
-        if ((tag1 == JS_TAG_BIG_INT || tag2 == JS_TAG_BIG_INT)) {
+        if (tag1 == JS_TAG_BIG_INT || tag1 == JS_TAG_SHORT_BIG_INT ||
+            tag2 == JS_TAG_BIG_INT || tag2 == JS_TAG_SHORT_BIG_INT) {
             if (tag1 == JS_TAG_STRING) {
                 op1 = JS_StringToBigInt(ctx, op1);
-                if (JS_VALUE_GET_TAG(op1) != JS_TAG_BIG_INT)
+                if (JS_VALUE_GET_TAG(op1) != JS_TAG_BIG_INT &&
+                    JS_VALUE_GET_TAG(op1) != JS_TAG_SHORT_BIG_INT)
                     goto invalid_bigint_string;
             }
             if (tag2 == JS_TAG_STRING) {
                 op2 = JS_StringToBigInt(ctx, op2);
-                if (JS_VALUE_GET_TAG(op2) != JS_TAG_BIG_INT) {
+                if (JS_VALUE_GET_TAG(op2) != JS_TAG_BIG_INT &&
+                    JS_VALUE_GET_TAG(op2) != JS_TAG_SHORT_BIG_INT ) {
                 invalid_bigint_string:
                     JS_FreeValue(ctx, op1);
                     JS_FreeValue(ctx, op2);
@@ -13343,8 +14867,10 @@ static no_inline int js_shr_slow(JSContext *ctx, JSValue *sp)
         goto exception;
     }
 
-    if ((JS_VALUE_GET_TAG(op1) == JS_TAG_BIG_INT ||
-         JS_VALUE_GET_TAG(op2) == JS_TAG_BIG_INT)) {
+    if (JS_VALUE_GET_TAG(op1) == JS_TAG_BIG_INT ||
+        JS_VALUE_GET_TAG(op1) == JS_TAG_SHORT_BIG_INT ||
+        JS_VALUE_GET_TAG(op2) == JS_TAG_BIG_INT ||
+        JS_VALUE_GET_TAG(op2) == JS_TAG_SHORT_BIG_INT) {
         JS_ThrowTypeError(ctx, "BigInt operands are forbidden for >>>");
         JS_FreeValue(ctx, op1);
         JS_FreeValue(ctx, op2);
@@ -13453,20 +14979,27 @@ static bool js_strict_eq2(JSContext *ctx, JSValue op1, JSValue op2,
             res = (d1 == d2); /* if NaN return false and +0 == -0 */
         }
         goto done_no_free;
+    case JS_TAG_SHORT_BIG_INT:
     case JS_TAG_BIG_INT:
         {
-            bf_t a_s, *a, b_s, *b;
-            if (tag1 != tag2) {
+            JSBigIntBuf buf1, buf2;
+            JSBigInt *p1, *p2;
+
+            if (tag2 != JS_TAG_SHORT_BIG_INT &&
+                tag2 != JS_TAG_BIG_INT) {
                 res = false;
                 break;
             }
-            a = JS_ToBigInt1(ctx, &a_s, op1);
-            b = JS_ToBigInt1(ctx, &b_s, op2);
-            res = bf_cmp_eq(a, b);
-            if (a == &a_s)
-                bf_delete(a);
-            if (b == &b_s)
-                bf_delete(b);
+
+            if (JS_VALUE_GET_TAG(op1) == JS_TAG_SHORT_BIG_INT)
+                p1 = js_bigint_set_short(&buf1, op1);
+            else
+                p1 = JS_VALUE_GET_PTR(op1);
+            if (JS_VALUE_GET_TAG(op2) == JS_TAG_SHORT_BIG_INT)
+                p2 = js_bigint_set_short(&buf2, op2);
+            else
+                p2 = JS_VALUE_GET_PTR(op2);
+            res = (js_bigint_cmp(ctx, p1, p2) == 0);
         }
         break;
     default:
@@ -13605,6 +15138,7 @@ static __exception int js_operator_typeof(JSContext *ctx, JSValue op1)
 
     tag = JS_VALUE_GET_NORM_TAG(op1);
     switch(tag) {
+    case JS_TAG_SHORT_BIG_INT:
     case JS_TAG_BIG_INT:
         atom = JS_ATOM_bigint;
         break;
@@ -14130,9 +15664,13 @@ static JSValue JS_IteratorNext(JSContext *ctx, JSValueConst enum_obj,
     obj = JS_IteratorNext2(ctx, enum_obj, method, argc, argv, &done);
     if (JS_IsException(obj))
         goto fail;
-    if (done != 2) {
-        *pdone = done;
+    if (likely(done == 0)) {
+        *pdone = false;
         return obj;
+    } else if (done != 2) {
+        JS_FreeValue(ctx, obj);
+        *pdone = true;
+        return JS_UNDEFINED;
     } else {
         done_val = JS_GetProperty(ctx, obj, JS_ATOM_done);
         if (JS_IsException(done_val))
@@ -14505,10 +16043,16 @@ static JSVarRef *get_var_ref(JSContext *ctx, JSStackFrame *sf,
 {
     JSVarRef *var_ref;
     struct list_head *el;
+    JSValue *pvalue;
+
+    if (is_arg)
+        pvalue = &sf->arg_buf[var_idx];
+    else
+        pvalue = &sf->var_buf[var_idx];
 
     list_for_each(el, &sf->var_ref_list) {
         var_ref = list_entry(el, JSVarRef, header.link);
-        if (var_ref->var_idx == var_idx && var_ref->is_arg == is_arg) {
+        if (var_ref->pvalue == pvalue) {
             var_ref->header.ref_count++;
             return var_ref;
         }
@@ -14519,13 +16063,8 @@ static JSVarRef *get_var_ref(JSContext *ctx, JSStackFrame *sf,
         return NULL;
     var_ref->header.ref_count = 1;
     var_ref->is_detached = false;
-    var_ref->is_arg = is_arg;
-    var_ref->var_idx = var_idx;
     list_add_tail(&var_ref->header.link, &sf->var_ref_list);
-    if (is_arg)
-        var_ref->pvalue = &sf->arg_buf[var_idx];
-    else
-        var_ref->pvalue = &sf->var_buf[var_idx];
+    var_ref->pvalue = pvalue;
     var_ref->value = JS_UNDEFINED;
     return var_ref;
 }
@@ -14754,15 +16293,10 @@ static void close_var_refs(JSRuntime *rt, JSStackFrame *sf)
 {
     struct list_head *el, *el1;
     JSVarRef *var_ref;
-    int var_idx;
 
     list_for_each_safe(el, el1, &sf->var_ref_list) {
         var_ref = list_entry(el, JSVarRef, header.link);
-        var_idx = var_ref->var_idx;
-        if (var_ref->is_arg)
-            var_ref->value = js_dup(sf->arg_buf[var_idx]);
-        else
-            var_ref->value = js_dup(sf->var_buf[var_idx]);
+        var_ref->value = js_dup(*var_ref->pvalue);
         var_ref->pvalue = &var_ref->value;
         /* the reference is no longer to a local variable */
         var_ref->is_detached = true;
@@ -14772,13 +16306,15 @@ static void close_var_refs(JSRuntime *rt, JSStackFrame *sf)
 
 static void close_lexical_var(JSContext *ctx, JSStackFrame *sf, int var_idx)
 {
+    JSValue *pvalue;
     struct list_head *el, *el1;
     JSVarRef *var_ref;
 
+    pvalue = &sf->var_buf[var_idx];
     list_for_each_safe(el, el1, &sf->var_ref_list) {
         var_ref = list_entry(el, JSVarRef, header.link);
-        if (var_idx == var_ref->var_idx && !var_ref->is_arg) {
-            var_ref->value = js_dup(sf->var_buf[var_idx]);
+        if (var_ref->pvalue == pvalue) {
+            var_ref->value = js_dup(*var_ref->pvalue);
             var_ref->pvalue = &var_ref->value;
             list_del(&var_ref->header.link);
             /* the reference is no longer to a local variable */
@@ -15128,6 +16664,10 @@ static JSValue JS_CallInternal(JSContext *caller_ctx, JSValueConst func_obj,
             *sp++ = js_int32(get_u32(pc));
             pc += 4;
             BREAK;
+        CASE(OP_push_bigint_i32):
+            *sp++ = __JS_NewShortBigInt(ctx, (int)get_u32(pc));
+            pc += 4;
+            BREAK;
         CASE(OP_push_const):
             *sp++ = js_dup(b->cpool[get_u32(pc)]);
             pc += 4;
@@ -18978,8 +20518,6 @@ static const JSOpCode opcode_info[OP_COUNT + (OP_TEMP_END - OP_TEMP_START)] = {
     opcode_info[(op) >= OP_TEMP_START ? \
                 (op) + (OP_TEMP_END - OP_TEMP_START) : (op)]
 
-static __exception int next_token(JSParseState *s);
-
 static void free_token(JSParseState *s, JSToken *token)
 {
     switch(token->val) {
@@ -19086,6 +20624,10 @@ int JS_PRINTF_FORMAT_ATTR(2, 3) js_parse_error(JSParseState *s, JS_PRINTF_FORMAT
     return -1;
 }
 
+#ifndef QJS_DISABLE_PARSER
+
+static __exception int next_token(JSParseState *s);
+
 static int js_parse_expect(JSParseState *s, int tok)
 {
     char buf[ATOM_GET_STR_BUF_SIZE];
@@ -19430,6 +20972,8 @@ static __exception int js_parse_regexp(JSParseState *s)
     return -1;
 }
 
+#endif // QJS_DISABLE_PARSER
+
 static __exception int ident_realloc(JSContext *ctx, char **pbuf, size_t *psize,
                                      char *static_buf)
 {
@@ -19457,6 +21001,8 @@ static __exception int ident_realloc(JSContext *ctx, char **pbuf, size_t *psize,
     return 0;
 }
 
+#ifndef QJS_DISABLE_PARSER
+
 /* convert a TOK_IDENT to a keyword when needed */
 static void update_token_ident(JSParseState *s)
 {
@@ -20031,6 +21577,8 @@ static __exception int next_token(JSParseState *s)
     return -1;
 }
 
+#endif // QJS_DISABLE_PARSER
+
 static int json_parse_error(JSParseState *s, const uint8_t *curp, const char *msg)
 {
     const uint8_t *p, *line_start;
@@ -20331,6 +21879,8 @@ static __exception int json_next_token(JSParseState *s)
     return -1;
 }
 
+#ifndef QJS_DISABLE_PARSER
+
 /* only used for ':' and '=>', 'let' or 'function' look-ahead. *pp is
    only set if TOK_IMPORT is returned */
 /* XXX: handle all unicode cases */
@@ -23482,7 +25032,17 @@ static __exception int js_parse_postfix_expr(JSParseState *s, int parse_flags)
             if (JS_VALUE_GET_TAG(val) == JS_TAG_INT) {
                 emit_op(s, OP_push_i32);
                 emit_u32(s, JS_VALUE_GET_INT(val));
+            } else if (JS_VALUE_GET_TAG(val) == JS_TAG_SHORT_BIG_INT) {
+                int64_t v;
+                v = JS_VALUE_GET_SHORT_BIG_INT(val);
+                if (v >= INT32_MIN && v <= INT32_MAX) {
+                    emit_op(s, OP_push_bigint_i32);
+                    emit_u32(s, v);
+                } else {
+                    goto large_number;
+                }
             } else {
+            large_number:
                 if (emit_push_const(s, val, 0) < 0)
                     return -1;
             }
@@ -26237,6 +27797,8 @@ fail:
     return -1;
 }
 
+#endif // QJS_DISABLE_PARSER
+
 /* 'name' is freed */
 static JSModuleDef *js_new_module_def(JSContext *ctx, JSAtom name)
 {
@@ -26322,6 +27884,8 @@ static void js_free_module_def(JSContext *ctx, JSModuleDef *m)
     js_free(ctx, m);
 }
 
+#ifndef QJS_DISABLE_PARSER
+
 static int add_req_module_entry(JSContext *ctx, JSModuleDef *m,
                                 JSAtom module_name)
 {
@@ -26346,6 +27910,8 @@ static int add_req_module_entry(JSContext *ctx, JSModuleDef *m,
     return i;
 }
 
+#endif // QJS_DISABLE_PARSER
+
 static JSExportEntry *find_export_entry(JSContext *ctx, const JSModuleDef *m,
                                         JSAtom export_name)
 {
@@ -26390,6 +27956,8 @@ static JSExportEntry *add_export_entry2(JSContext *ctx,
     return me;
 }
 
+#ifndef QJS_DISABLE_PARSER
+
 static JSExportEntry *add_export_entry(JSParseState *s, JSModuleDef *m,
                                        JSAtom local_name, JSAtom export_name,
                                        JSExportTypeEnum export_type)
@@ -26413,6 +27981,8 @@ static int add_star_export_entry(JSContext *ctx, JSModuleDef *m,
     return 0;
 }
 
+#endif // QJS_DISABLE_PARSER
+
 /* create a C module */
 /* `name_str` may be pure ASCII or UTF-8 encoded */
 JSModuleDef *JS_NewCModule(JSContext *ctx, const char *name_str,
@@ -28061,6 +29631,8 @@ static JSValue js_evaluate_module(JSContext *ctx, JSModuleDef *m)
     return js_dup(m->promise);
 }
 
+#ifndef QJS_DISABLE_PARSER
+
 static __exception JSAtom js_parse_from_clause(JSParseState *s)
 {
     JSAtom module_name;
@@ -28500,6 +30072,8 @@ static JSFunctionDef *js_new_function_def(JSContext *ctx,
     return fd;
 }
 
+#endif // QJS_DISABLE_PARSER
+
 static void free_bytecode_atoms(JSRuntime *rt,
                                 const uint8_t *bc_buf, int bc_len,
                                 bool use_short_opcodes)
@@ -28533,6 +30107,8 @@ static void free_bytecode_atoms(JSRuntime *rt,
     }
 }
 
+#ifndef QJS_DISABLE_PARSER
+
 static void js_free_function_def(JSContext *ctx, JSFunctionDef *fd)
 {
     int i;
@@ -28595,6 +30171,8 @@ static void js_free_function_def(JSContext *ctx, JSFunctionDef *fd)
     js_free(ctx, fd);
 }
 
+#endif // QJS_DISABLE_PARSER
+
 #ifdef ENABLE_DUMPS // JS_DUMP_BYTECODE_*
 static const char *skip_lines(const char *p, int n) {
     while (p && n-- > 0 && *p) {
@@ -29051,6 +30629,8 @@ static __maybe_unused void js_dump_function_bytecode(JSContext *ctx, JSFunctionB
 }
 #endif
 
+#ifndef QJS_DISABLE_PARSER
+
 static int add_closure_var(JSContext *ctx, JSFunctionDef *s,
                            bool is_local, bool is_arg,
                            int var_idx, JSAtom var_name,
@@ -31544,6 +33124,28 @@ static __exception int resolve_labels(JSContext *ctx, JSFunctionDef *s)
             push_short_int(&bc_out, val);
             break;
 
+        case OP_push_bigint_i32:
+            {
+                /* transform i32(val) neg -> i32(-val) */
+                val = get_i32(bc_buf + pos + 1);
+                if (val != INT32_MIN
+                &&  code_match(&cc, pos_next, OP_neg, -1)) {
+                    if (cc.line_num >= 0) line_num = cc.line_num;
+                    if (cc.col_num >= 0) col_num = cc.col_num;
+                    if (code_match(&cc, cc.pos, OP_drop, -1)) {
+                        if (cc.line_num >= 0) line_num = cc.line_num;
+                        if (cc.col_num >= 0) col_num = cc.col_num;
+                    } else {
+                        add_pc2line_info(s, bc_out.size, line_num, col_num);
+                        dbuf_putc(&bc_out, OP_push_bigint_i32);
+                        dbuf_put_u32(&bc_out, -val);
+                    }
+                    pos_next = cc.pos;
+                    break;
+                }
+            }
+            goto no_change;
+
         case OP_push_const:
         case OP_fclosure:
             {
@@ -32336,8 +33938,10 @@ static JSValue js_create_function(JSContext *ctx, JSFunctionDef *fd)
        are used to compile the eval and they must be ordered by scope,
        so it is necessary to create the closure variables before any
        other variable lookup is done. */
+#ifndef QJS_DISABLE_PARSER
     if (fd->has_eval_call)
         add_eval_variables(ctx, fd);
+#endif // QJS_DISABLE_PARSER
 
     /* add the module global variables in the closure */
     if (fd->module) {
@@ -32496,6 +34100,8 @@ static JSValue js_create_function(JSContext *ctx, JSFunctionDef *fd)
     return JS_EXCEPTION;
 }
 
+#endif // QJS_DISABLE_PARSER
+
 static void free_function_bytecode(JSRuntime *rt, JSFunctionBytecode *b)
 {
     int i;
@@ -32530,6 +34136,8 @@ static void free_function_bytecode(JSRuntime *rt, JSFunctionBytecode *b)
     }
 }
 
+#ifndef QJS_DISABLE_PARSER
+
 static __exception int js_parse_directives(JSParseState *s)
 {
     char str[20];
@@ -33341,6 +34949,8 @@ static __exception int js_parse_program(JSParseState *s)
     return 0;
 }
 
+#endif // QJS_DISABLE_PARSER
+
 static void js_parse_init(JSContext *ctx, JSParseState *s,
                           const char *input, size_t input_len,
                           const char *filename, int line)
@@ -33396,6 +35006,8 @@ JSValue JS_EvalFunction(JSContext *ctx, JSValue fun_obj)
     return JS_EvalFunctionInternal(ctx, fun_obj, ctx->global_obj, NULL, NULL);
 }
 
+#ifndef QJS_DISABLE_PARSER
+
 /* 'input' must be zero terminated i.e. input[input_len] = '\0'. */
 /* `export_name` and `input` may be pure ASCII or UTF-8 encoded */
 static JSValue __JS_EvalInternal(JSContext *ctx, JSValueConst this_obj,
@@ -33512,6 +35124,8 @@ static JSValue __JS_EvalInternal(JSContext *ctx, JSValueConst this_obj,
     return JS_EXCEPTION;
 }
 
+#endif // QJS_DISABLE_PARSER
+
 /* the indirection is needed to make 'eval' optional */
 static JSValue JS_EvalInternal(JSContext *ctx, JSValueConst this_obj,
                                const char *input, size_t input_len,
@@ -33748,7 +35362,7 @@ typedef enum BCTagEnum {
     BC_TAG_SYMBOL,
 } BCTagEnum;
 
-#define BC_VERSION 19
+#define BC_VERSION 20
 
 typedef struct BCWriterState {
     JSContext *ctx;
@@ -33798,6 +35412,13 @@ static const char * const bc_tag_str[] = {
     "Set",
     "Symbol",
 };
+
+static const char *bc_tag_name(uint8_t tag)
+{
+    if (tag >= countof(bc_tag_str))
+        return "<bad tag>";
+    return bc_tag_str[tag];
+}
 #endif
 
 static void bc_put_u8(BCWriterState *s, uint8_t v)
@@ -34018,71 +35639,44 @@ static void JS_WriteString(BCWriterState *s, JSString *p)
 
 static int JS_WriteBigInt(BCWriterState *s, JSValueConst obj)
 {
-    uint32_t tag, tag1;
-    int64_t e;
-    JSBigInt *bf = JS_VALUE_GET_PTR(obj);
-    bf_t *a = &bf->num;
-    size_t len, i, n1, j;
-    limb_t v;
+    JSBigIntBuf buf;
+    JSBigInt *p;
+    uint32_t len, i;
+    js_limb_t v, b;
+    int shift;
 
-    tag = JS_VALUE_GET_TAG(obj);
-    switch(tag) {
-    case JS_TAG_BIG_INT:
-        tag1 = BC_TAG_BIG_INT;
-        break;
-    default:
-        abort();
-    }
-    bc_put_u8(s, tag1);
+    bc_put_u8(s, BC_TAG_BIG_INT);
 
-    /* sign + exponent */
-    if (a->expn == BF_EXP_ZERO)
-        e = 0;
-    else if (a->expn == BF_EXP_INF)
-        e = 1;
-    else if (a->expn == BF_EXP_NAN)
-        e = 2;
-    else if (a->expn >= 0)
-        e = a->expn + 3;
+    if (JS_VALUE_GET_TAG(obj) == JS_TAG_SHORT_BIG_INT)
+        p = js_bigint_set_short(&buf, obj);
     else
-        e = a->expn;
-    e = (e * 2) | a->sign;
-    if (e < INT32_MIN || e > INT32_MAX) {
-        JS_ThrowRangeError(s->ctx, "maximum BigInt size exceeded");
-        return -1;
-    }
-    bc_put_sleb128(s, e);
-
-    /* mantissa */
-    if (a->len != 0) {
-        i = 0;
-        while (i < a->len && a->tab[i] == 0)
-            i++;
-        assert(i < a->len);
-        v = a->tab[i];
-        n1 = sizeof(limb_t);
-        while ((v & 0xff) == 0) {
-            n1--;
-            v >>= 8;
-        }
-        i++;
-        len = (a->len - i) * sizeof(limb_t) + n1;
-        if (len > INT32_MAX) {
-            JS_ThrowRangeError(s->ctx, "maximum BigInt size exceeded");
-            return -1;
+        p = JS_VALUE_GET_PTR(obj);
+    if (p->len == 1 && p->tab[0] == 0) {
+        /* zero case */
+        len = 0;
+    } else {
+        /* compute the length of the two's complement representation
+           in bytes */
+        len = p->len * (JS_LIMB_BITS / 8);
+        v = p->tab[p->len - 1];
+        shift = JS_LIMB_BITS - 8;
+        while (shift > 0) {
+            b = (v >> shift) & 0xff;
+            if (b != 0x00 && b != 0xff)
+                break;
+            if ((b & 1) != ((v >> (shift - 1)) & 1))
+                break;
+            shift -= 8;
+            len--;
         }
-        bc_put_leb128(s, len);
-        /* always saved in byte based little endian representation */
-        for(j = 0; j < n1; j++) {
-            bc_put_u8(s, v >> (j * 8));
+    }
+    bc_put_leb128(s, len);
+    if (len > 0) {
+        for(i = 0; i < (len / (JS_LIMB_BITS / 8)); i++) {
+            bc_put_u32(s, p->tab[i]);
         }
-        for(; i < a->len; i++) {
-            limb_t v = a->tab[i];
-#if LIMB_BITS == 32
-            bc_put_u32(s, v);
-#else
-            bc_put_u64(s, v);
-#endif
+        for(i = 0; i < len % (JS_LIMB_BITS / 8); i++) {
+            bc_put_u8(s, (p->tab[p->len - 1] >> (i * 8)) & 0xff);
         }
     }
     return 0;
@@ -34505,6 +36099,7 @@ static int JS_WriteObjectRec(BCWriterState *s, JSValueConst obj)
                 goto fail;
         }
         break;
+    case JS_TAG_SHORT_BIG_INT:
     case JS_TAG_BIG_INT:
         if (JS_WriteBigInt(s, obj))
             goto fail;
@@ -34850,6 +36445,10 @@ static JSString *JS_ReadString(BCReaderState *s)
         return NULL;
     is_wide_char = len & 1;
     len >>= 1;
+    if (len > JS_STRING_LEN_MAX) {
+        JS_ThrowInternalError(s->ctx, "string too long");
+        return NULL;
+    }
     p = js_alloc_string(s->ctx, len, is_wide_char);
     if (!p) {
         s->error_state = -1;
@@ -34947,77 +36546,46 @@ static int JS_ReadFunctionBytecode(BCReaderState *s, JSFunctionBytecode *b,
 
 static JSValue JS_ReadBigInt(BCReaderState *s)
 {
-    JSValue obj;
+    JSValue obj = JS_UNDEFINED;
+    uint32_t len, i, n;
+    JSBigInt *p;
+    js_limb_t v;
     uint8_t v8;
-    int32_t e;
-    uint32_t len;
-    limb_t l, i, n;
-    limb_t v;
-    bf_t *a;
 
-    obj = JS_NewBigInt(s->ctx);
-    if (JS_IsException(obj))
+    if (bc_get_leb128(s, &len))
         goto fail;
-
-    /* sign + exponent */
-    if (bc_get_sleb128(s, &e))
+    bc_read_trace(s, "len=%" PRId64 "\n", (int64_t)len);
+    if (len == 0) {
+        /* zero case */
+        bc_read_trace(s, "}\n");
+        return __JS_NewShortBigInt(s->ctx, 0);
+    }
+    p = js_bigint_new(s->ctx, (len - 1) / (JS_LIMB_BITS / 8) + 1);
+    if (!p)
         goto fail;
-
-    a = JS_GetBigInt(obj);
-    a->sign = e & 1;
-    e >>= 1;
-    if (e == 0)
-        a->expn = BF_EXP_ZERO;
-    else if (e == 1)
-        a->expn = BF_EXP_INF;
-    else if (e == 2)
-        a->expn = BF_EXP_NAN;
-    else if (e >= 3)
-        a->expn = e - 3;
-    else
-        a->expn = e;
-
-    /* mantissa */
-    if (a->expn != BF_EXP_ZERO &&
-        a->expn != BF_EXP_INF &&
-        a->expn != BF_EXP_NAN) {
-        if (bc_get_leb128(s, &len))
-            goto fail;
-        bc_read_trace(s, "len=%" PRId64 "\n", (int64_t)len);
-        if (len == 0) {
-            JS_ThrowRangeError(s->ctx, "maximum BigInt size exceeded");
-            goto fail;
-        }
-        l = (len + sizeof(limb_t) - 1) / sizeof(limb_t);
-        if (bf_resize(a, l)) {
-            JS_ThrowOutOfMemory(s->ctx);
+    for(i = 0; i < len / (JS_LIMB_BITS / 8); i++) {
+        if (bc_get_u32(s, &v))
             goto fail;
-        }
-        n = len & (sizeof(limb_t) - 1);
-        if (n != 0) {
-            v = 0;
-            for(i = 0; i < n; i++) {
-                if (bc_get_u8(s, &v8))
-                    goto fail;
-                v |= (limb_t)v8 << ((sizeof(limb_t) - n + i) * 8);
-            }
-            a->tab[0] = v;
-            i = 1;
-        } else {
-            i = 0;
-        }
-        for(; i < l; i++) {
-#if LIMB_BITS == 32
-            if (bc_get_u32(s, &v))
-                goto fail;
-#else
-            if (bc_get_u64(s, &v))
+        p->tab[i] = v;
+    }
+    n = len % (JS_LIMB_BITS / 8);
+    if (n != 0) {
+        int shift;
+        v = 0;
+        for(i = 0; i < n; i++) {
+            if (bc_get_u8(s, &v8))
                 goto fail;
-#endif
-            a->tab[i] = v;
+            v |= (js_limb_t)v8 << (i * 8);
+        }
+        shift = JS_LIMB_BITS - n * 8;
+        /* extend the sign */
+        if (shift != 0) {
+            v = (js_slimb_t)(v << shift) >> shift;
         }
+        p->tab[p->len - 1] = v;
     }
-    return obj;
+    bc_read_trace(s, "}\n");
+    return JS_CompactBigInt(s->ctx, p);
  fail:
     JS_FreeValue(s->ctx, obj);
     return JS_EXCEPTION;
@@ -35707,7 +37275,7 @@ static JSValue JS_ReadObjectRec(BCReaderState *s)
     if (bc_get_u8(s, &tag))
         return JS_EXCEPTION;
 
-    bc_read_trace(s, "%s {\n", bc_tag_str[tag]);
+    bc_read_trace(s, "%s {\n", bc_tag_name(tag));
 
     switch(tag) {
     case BC_TAG_NULL:
@@ -36284,6 +37852,7 @@ JSValue JS_ToObject(JSContext *ctx, JSValueConst val)
     case JS_TAG_OBJECT:
     case JS_TAG_EXCEPTION:
         return js_dup(val);
+    case JS_TAG_SHORT_BIG_INT:
     case JS_TAG_BIG_INT:
         obj = JS_NewObjectClass(ctx, JS_CLASS_BIG_INT);
         goto set_value;
@@ -37289,10 +38858,8 @@ static JSValue js_object_fromEntries(JSContext *ctx, JSValueConst this_val,
         item = JS_IteratorNext(ctx, iter, next_method, 0, NULL, &done);
         if (JS_IsException(item))
             goto fail;
-        if (done) {
-            JS_FreeValue(ctx, item);
+        if (done)
             break;
-        }
 
         key = JS_UNDEFINED;
         value = JS_UNDEFINED;
@@ -41056,13 +42623,18 @@ static JSValue js_number_constructor(JSContext *ctx, JSValueConst new_target,
         if (JS_IsException(val))
             return val;
         switch(JS_VALUE_GET_TAG(val)) {
+        case JS_TAG_SHORT_BIG_INT:
+            val = JS_NewInt64(ctx, JS_VALUE_GET_SHORT_BIG_INT(val));
+            if (JS_IsException(val))
+                return val;
+            break;
         case JS_TAG_BIG_INT:
             {
                 JSBigInt *p = JS_VALUE_GET_PTR(val);
                 double d;
-                bf_get_float64(&p->num, &d, BF_RNDN);
+                d = js_bigint_to_float64(ctx, p);
                 JS_FreeValue(ctx, val);
-                val = js_float64(d);
+                val = JS_NewFloat64(ctx, d);
             }
             break;
         default:
@@ -42555,7 +44127,7 @@ static JSValue js_string_pad(JSContext *ctx, JSValueConst this_val,
     }
     if (n > JS_STRING_LEN_MAX) {
         JS_ThrowRangeError(ctx, "invalid string length");
-        goto fail2;
+        goto fail3;
     }
     if (string_buffer_init(ctx, b, n))
         goto fail3;
@@ -43287,25 +44859,32 @@ static JSValue js_math_clz32(JSContext *ctx, JSValueConst this_val,
     return js_int32(r);
 }
 
+typedef enum SumPreciseStateEnum {
+    SUM_PRECISE_STATE_MINUS_ZERO,
+    SUM_PRECISE_STATE_NOT_A_NUMBER,
+    SUM_PRECISE_STATE_MINUS_INFINITY,
+    SUM_PRECISE_STATE_PLUS_INFINITY,
+    SUM_PRECISE_STATE_FINITE,
+} SumPreciseStateEnum;
+
 static JSValue js_math_sumPrecise(JSContext *ctx, JSValueConst this_val,
                                   int argc, JSValueConst *argv)
 {
     JSValue iter, next, item, ret;
-    bf_t a, b;
     int done;
     double d;
-    int r;
+    xsum_small_accumulator acc;
+    SumPreciseStateEnum state;
 
     iter = JS_GetIterator(ctx, argv[0], /*async*/false);
     if (JS_IsException(iter))
         return JS_EXCEPTION;
-    bf_init(ctx->bf_ctx, &a);
-    bf_init(ctx->bf_ctx, &b);
     ret = JS_EXCEPTION;
     next = JS_GetProperty(ctx, iter, JS_ATOM_next);
     if (JS_IsException(next))
         goto fail;
-    bf_set_zero(&a, /*is_neg*/true);
+    xsum_small_init(&acc);
+    state = SUM_PRECISE_STATE_MINUS_ZERO;
     for (;;) {
         item = JS_IteratorNext(ctx, iter, next, 0, NULL, &done);
         if (JS_IsException(item))
@@ -43324,25 +44903,52 @@ static JSValue js_math_sumPrecise(JSContext *ctx, JSValueConst this_val,
             d = JS_VALUE_GET_FLOAT64(item);
             break;
         }
-        if (bf_set_float64(&b, d))
-            goto oom;
-        // Infinity + -Infinity results in BF_ST_INVALID_OP, sets |a| to nan
-        if ((r = bf_add(&a, &a, &b, BF_PREC_INF, BF_RNDN)))
-            if (r != BF_ST_INVALID_OP)
-                goto oom;
+
+        if (state != SUM_PRECISE_STATE_NOT_A_NUMBER) {
+            if (isnan(d))
+                state = SUM_PRECISE_STATE_NOT_A_NUMBER;
+            else if (!isfinite(d) && d > 0.0)
+                if (state == SUM_PRECISE_STATE_MINUS_INFINITY)
+                    state = SUM_PRECISE_STATE_NOT_A_NUMBER;
+                else
+                    state = SUM_PRECISE_STATE_PLUS_INFINITY;
+            else if (!isfinite(d) && d < 0.0)
+                if (state == SUM_PRECISE_STATE_PLUS_INFINITY)
+                    state = SUM_PRECISE_STATE_NOT_A_NUMBER;
+                else
+                    state = SUM_PRECISE_STATE_MINUS_INFINITY;
+            else if (!(d == 0.0 && signbit(d)) && (state == SUM_PRECISE_STATE_MINUS_ZERO || state == SUM_PRECISE_STATE_FINITE)) {
+                state = SUM_PRECISE_STATE_FINITE;
+                xsum_small_add1(&acc, d);
+            }
+        }
+    }
+
+    switch (state) {
+    case SUM_PRECISE_STATE_NOT_A_NUMBER:
+        d = NAN;
+        break;
+    case SUM_PRECISE_STATE_MINUS_INFINITY:
+        d = -INFINITY;
+        break;
+    case SUM_PRECISE_STATE_PLUS_INFINITY:
+        d = INFINITY;
+        break;
+    case SUM_PRECISE_STATE_MINUS_ZERO:
+        d = -0.0;
+        break;
+    case SUM_PRECISE_STATE_FINITE:
+        d = xsum_small_round(&acc);
+        break;
+    default:
+        abort();
     }
-    bf_get_float64(&a, &d, BF_RNDN); // return value deliberately ignored
     ret = js_float64(d);
 fail:
     JS_IteratorClose(ctx, iter, JS_IsException(ret));
     JS_FreeValue(ctx, iter);
     JS_FreeValue(ctx, next);
-    bf_delete(&a);
-    bf_delete(&b);
     return ret;
-oom:
-    JS_ThrowOutOfMemory(ctx);
-    goto fail;
 }
 
 /* xorshift* random number generator by Marsaglia */
@@ -43965,6 +45571,14 @@ bool lre_check_stack_overflow(void *opaque, size_t alloca_size)
     return js_check_stack_overflow(ctx->rt, alloca_size);
 }
 
+int lre_check_timeout(void *opaque)
+{
+    JSContext *ctx = opaque;
+    JSRuntime *rt = ctx->rt;
+    return (rt->interrupt_handler &&
+            rt->interrupt_handler(rt, rt->interrupt_opaque));
+}
+
 #if 0
 void *lre_realloc(void *opaque, void *ptr, size_t size)
 {
@@ -44081,7 +45695,11 @@ static JSValue js_regexp_exec(JSContext *ctx, JSValueConst this_val,
                     goto fail;
             }
         } else {
-            JS_ThrowInternalError(ctx, "out of memory in regexp execution");
+            if (rc == LRE_RET_TIMEOUT) {
+                JS_ThrowInterrupted(ctx);
+            } else {
+                JS_ThrowInternalError(ctx, "out of memory in regexp execution");
+            }
             goto fail;
         }
     } else {
@@ -44276,7 +45894,11 @@ static JSValue JS_RegExpDelete(JSContext *ctx, JSValueConst this_val, JSValue ar
                         goto fail;
                 }
             } else {
-                JS_ThrowInternalError(ctx, "out of memory in regexp execution");
+                if (ret == LRE_RET_TIMEOUT) {
+                    JS_ThrowInterrupted(ctx);
+                } else {
+                    JS_ThrowInternalError(ctx, "out of memory in regexp execution");
+                }
                 goto fail;
             }
             break;
@@ -45422,6 +47044,7 @@ static JSValue js_json_check(JSContext *ctx, JSONStringifyContext *jsc,
     case JS_TAG_FLOAT64:
     case JS_TAG_BOOL:
     case JS_TAG_NULL:
+    case JS_TAG_SHORT_BIG_INT:
     case JS_TAG_BIG_INT:
     case JS_TAG_EXCEPTION:
         return val;
@@ -45452,6 +47075,11 @@ static int js_json_to_str(JSContext *ctx, JSONStringifyContext *jsc,
     tab = JS_UNDEFINED;
     prop = JS_UNDEFINED;
 
+    if (js_check_stack_overflow(ctx->rt, 0)) {
+        JS_ThrowStackOverflow(ctx);
+        goto exception;
+    }
+
     if (JS_IsObject(val)) {
         p = JS_VALUE_GET_OBJ(val);
         cl = p->class_id;
@@ -45599,6 +47227,7 @@ static int js_json_to_str(JSContext *ctx, JSONStringifyContext *jsc,
     case JS_TAG_NULL:
     concat_value:
         return string_buffer_concat_value_free(jsc->b, val);
+    case JS_TAG_SHORT_BIG_INT:
     case JS_TAG_BIG_INT:
         JS_ThrowTypeError(ctx, "BigInt are forbidden in JSON.stringify");
         goto exception;
@@ -47050,26 +48679,6 @@ static const JSCFunctionListEntry js_symbol_funcs[] = {
 
 /* Set/Map/WeakSet/WeakMap */
 
-typedef struct JSMapRecord {
-    int ref_count; /* used during enumeration to avoid freeing the record */
-    bool empty; /* true if the record is deleted */
-    struct JSMapState *map;
-    struct list_head link;
-    struct list_head hash_link;
-    JSValue key;
-    JSValue value;
-} JSMapRecord;
-
-typedef struct JSMapState {
-    bool is_weak; /* true if WeakSet/WeakMap */
-    struct list_head records; /* list of JSMapRecord.link */
-    uint32_t record_count;
-    struct list_head *hash_table;
-    uint32_t hash_size; /* must be a power of two */
-    uint32_t record_count_threshold; /* count at which a hash table
-                                        resize is needed */
-} JSMapState;
-
 #define MAGIC_SET (1 << 0)
 #define MAGIC_WEAK (1 << 1)
 
@@ -47125,10 +48734,8 @@ static JSValue js_map_constructor(JSContext *ctx, JSValueConst new_target,
             item = JS_IteratorNext(ctx, iter, next_method, 0, NULL, &done);
             if (JS_IsException(item))
                 goto fail;
-            if (done) {
-                JS_FreeValue(ctx, item);
+            if (done)
                 break;
-            }
             if (is_set) {
                 ret = JS_Call(ctx, adder, obj, 1, vc(&item));
                 if (JS_IsException(ret)) {
@@ -47206,7 +48813,7 @@ static uint32_t map_hash_key(JSContext *ctx, JSValueConst key)
     uint32_t h;
     double d;
     JSFloat64Union u;
-    bf_t *a;
+    JSBigInt *r;
 
     switch(tag) {
     case JS_TAG_BOOL:
@@ -47222,9 +48829,12 @@ static uint32_t map_hash_key(JSContext *ctx, JSValueConst key)
     case JS_TAG_INT:
         d = JS_VALUE_GET_INT(key);
         goto hash_float64;
+    case JS_TAG_SHORT_BIG_INT:
+        d = JS_VALUE_GET_SHORT_BIG_INT(key);
+        goto hash_float64;
     case JS_TAG_BIG_INT:
-        a = JS_GetBigInt(key);
-        h = hash_string8((void *)a->tab, a->len * sizeof(*a->tab), 0);
+        r = JS_VALUE_GET_PTR(key);
+        h = hash_string8((void *)r->tab, r->len * sizeof(*r->tab), 0);
         break;
     case JS_TAG_FLOAT64:
         d = JS_VALUE_GET_FLOAT64(key);
@@ -47261,7 +48871,6 @@ static JSMapRecord *map_find_record(JSContext *ctx, JSMapState *s,
 static void map_hash_resize(JSContext *ctx, JSMapState *s)
 {
     uint32_t new_hash_size, i, h;
-    size_t slack;
     struct list_head *new_hash_table, *el;
     JSMapRecord *mr;
 
@@ -47270,11 +48879,10 @@ static void map_hash_resize(JSContext *ctx, JSMapState *s)
         new_hash_size = 4;
     else
         new_hash_size = s->hash_size * 2;
-    new_hash_table = js_realloc2(ctx, s->hash_table,
-                                 sizeof(new_hash_table[0]) * new_hash_size, &slack);
+    new_hash_table = js_realloc(ctx, s->hash_table,
+                                sizeof(new_hash_table[0]) * new_hash_size);
     if (!new_hash_table)
         return;
-    new_hash_size += slack / sizeof(*new_hash_table);
 
     for(i = 0; i < new_hash_size; i++)
         init_list_head(&new_hash_table[i]);
@@ -47674,10 +49282,10 @@ static void js_map_mark(JSRuntime *rt, JSValueConst val,
 
     s = p->u.map_state;
     if (s) {
+        assert(!s->is_weak);
         list_for_each(el, &s->records) {
             mr = list_entry(el, JSMapRecord, link);
-            if (!s->is_weak)
-                JS_MarkValue(rt, mr->key, mark_func);
+            JS_MarkValue(rt, mr->key, mark_func);
             JS_MarkValue(rt, mr->value, mark_func);
         }
     }
@@ -48741,6 +50349,38 @@ static JSValue promise_reaction_job(JSContext *ctx, int argc,
     return res2;
 }
 
+static JSValue promise_rejection_tracker_job(JSContext *ctx, int argc,
+                                             JSValueConst *argv)
+{
+    JSRuntime *rt;
+    JSPromiseData *s;
+    JSValueConst promise;
+
+    assert(argc == 1);
+
+    rt = ctx->rt;
+    promise = argv[0];
+    s = JS_GetOpaque(promise, JS_CLASS_PROMISE);
+
+    if (!s || s->promise_state != JS_PROMISE_REJECTED)
+        return JS_UNDEFINED; /* should never happen */
+
+    promise_trace(ctx, "promise_rejection_tracker_job\n");
+
+    // Check again in case the hook was removed.
+    if (rt->host_promise_rejection_tracker)
+        rt->host_promise_rejection_tracker(
+            ctx, promise, s->promise_result, s->is_handled, rt->host_promise_rejection_tracker_opaque);
+
+    return JS_UNDEFINED;
+}
+
+void JS_SetPromiseHook(JSRuntime *rt, JSPromiseHook promise_hook, void *opaque)
+{
+    rt->promise_hook = promise_hook;
+    rt->promise_hook_opaque = opaque;
+}
+
 void JS_SetHostPromiseRejectionTracker(JSRuntime *rt,
                                        JSHostPromiseRejectionTracker *cb,
                                        void *opaque)
@@ -48764,11 +50404,11 @@ static void fulfill_or_reject_promise(JSContext *ctx, JSValueConst promise,
 
     promise_trace(ctx, "fulfill_or_reject_promise: is_reject=%d\n", is_reject);
 
-    if (s->promise_state == JS_PROMISE_REJECTED && !s->is_handled) {
+    if (s->promise_state == JS_PROMISE_FULFILLED) {
         JSRuntime *rt = ctx->rt;
-        if (rt->host_promise_rejection_tracker) {
-            rt->host_promise_rejection_tracker(ctx, promise, value, false,
-                                               rt->host_promise_rejection_tracker_opaque);
+        if (rt->promise_hook) {
+            rt->promise_hook(ctx, JS_PROMISE_HOOK_RESOLVE, promise,
+                             JS_UNDEFINED, rt->promise_hook_opaque);
         }
     }
 
@@ -48789,12 +50429,12 @@ static void fulfill_or_reject_promise(JSContext *ctx, JSValueConst promise,
         list_del(&rd->link);
         promise_reaction_data_free(ctx->rt, rd);
     }
-}
 
-static void reject_promise(JSContext *ctx, JSValueConst promise,
-                           JSValueConst value)
-{
-    fulfill_or_reject_promise(ctx, promise, value, true);
+    if (s->promise_state == JS_PROMISE_REJECTED && !s->is_handled) {
+        JSRuntime *rt = ctx->rt;
+        if (rt->host_promise_rejection_tracker)
+            JS_EnqueueJob(ctx, promise_rejection_tracker_job, 1, &promise);
+    }
 }
 
 static JSValue js_promise_resolve_thenable_job(JSContext *ctx,
@@ -48802,6 +50442,7 @@ static JSValue js_promise_resolve_thenable_job(JSContext *ctx,
 {
     JSValueConst promise, thenable, then;
     JSValue args[2], res;
+    JSRuntime *rt;
 
     promise_trace(ctx, "js_promise_resolve_thenable_job\n");
 
@@ -48811,7 +50452,16 @@ static JSValue js_promise_resolve_thenable_job(JSContext *ctx,
     then = argv[2];
     if (js_create_resolving_functions(ctx, args, promise) < 0)
         return JS_EXCEPTION;
+    rt = ctx->rt;
+    if (rt->promise_hook) {
+        rt->promise_hook(ctx, JS_PROMISE_HOOK_BEFORE, promise, JS_UNDEFINED,
+                         rt->promise_hook_opaque);
+    }
     res = JS_Call(ctx, then, thenable, 2, vc(args));
+    if (rt->promise_hook) {
+        rt->promise_hook(ctx, JS_PROMISE_HOOK_AFTER, promise, JS_UNDEFINED,
+                         rt->promise_hook_opaque);
+    }
     if (JS_IsException(res)) {
         JSValue error = JS_GetException(ctx);
         res = JS_Call(ctx, args[1], JS_UNDEFINED, 1, vc(&error));
@@ -48932,7 +50582,7 @@ static JSValue js_promise_resolve_function_call(JSContext *ctx,
         JSValue error;
     fail_reject:
         error = JS_GetException(ctx);
-        reject_promise(ctx, s->promise, error);
+        fulfill_or_reject_promise(ctx, s->promise, error, true);
         JS_FreeValue(ctx, error);
     } else if (!JS_IsFunction(ctx, then)) {
         JS_FreeValue(ctx, then);
@@ -48994,6 +50644,7 @@ static JSValue js_promise_constructor(JSContext *ctx, JSValueConst new_target,
     JSValueConst executor;
     JSValue obj;
     JSPromiseData *s;
+    JSRuntime *rt;
     JSValue args[2], ret;
     int i;
 
@@ -49014,6 +50665,14 @@ static JSValue js_promise_constructor(JSContext *ctx, JSValueConst new_target,
     JS_SetOpaqueInternal(obj, s);
     if (js_create_resolving_functions(ctx, args, obj))
         goto fail;
+    rt = ctx->rt;
+    if (rt->promise_hook) {
+        JSValueConst parent_promise = JS_UNDEFINED;
+        if (rt->parent_promise)
+            parent_promise = rt->parent_promise->value;
+        rt->promise_hook(ctx, JS_PROMISE_HOOK_INIT, obj, parent_promise,
+                         rt->promise_hook_opaque);
+    }
     ret = JS_Call(ctx, executor, JS_UNDEFINED, 2, vc(args));
     if (JS_IsException(ret)) {
         JSValue ret2, error;
@@ -49071,8 +50730,7 @@ static JSValue js_new_promise_capability(JSContext *ctx,
 
     executor = js_promise_executor_new(ctx);
     if (JS_IsException(executor))
-        return executor;
-
+        return JS_EXCEPTION;
     if (JS_IsUndefined(ctor)) {
         result_promise = js_promise_constructor(ctx, ctor, 1, vc(&executor));
     } else {
@@ -49523,10 +51181,8 @@ static __exception int perform_promise_then(JSContext *ctx,
         JSValueConst args[5];
         if (s->promise_state == JS_PROMISE_REJECTED && !s->is_handled) {
             JSRuntime *rt = ctx->rt;
-            if (rt->host_promise_rejection_tracker) {
-                rt->host_promise_rejection_tracker(ctx, promise, s->promise_result,
-                                                   true, rt->host_promise_rejection_tracker_opaque);
-            }
+            if (rt->host_promise_rejection_tracker)
+                JS_EnqueueJob(ctx, promise_rejection_tracker_job, 1, &promise);
         }
         i = s->promise_state - JS_PROMISE_FULFILLED;
         rd = rd_array[i];
@@ -49547,7 +51203,10 @@ static JSValue js_promise_then(JSContext *ctx, JSValueConst this_val,
                                int argc, JSValueConst *argv)
 {
     JSValue ctor, result_promise, resolving_funcs[2];
+    bool have_promise_hook;
+    JSValueLink link;
     JSPromiseData *s;
+    JSRuntime *rt;
     int i, ret;
 
     s = JS_GetOpaque2(ctx, this_val, JS_CLASS_PROMISE);
@@ -49557,7 +51216,16 @@ static JSValue js_promise_then(JSContext *ctx, JSValueConst this_val,
     ctor = JS_SpeciesConstructor(ctx, this_val, JS_UNDEFINED);
     if (JS_IsException(ctor))
         return ctor;
+    rt = ctx->rt;
+    // always restore, even if js_new_promise_capability callee removes hook
+    have_promise_hook = (rt->promise_hook != NULL);
+    if (have_promise_hook) {
+        link = (JSValueLink){rt->parent_promise, this_val};
+        rt->parent_promise = &link;
+    }
     result_promise = js_new_promise_capability(ctx, resolving_funcs, ctor);
+    if (have_promise_hook)
+        rt->parent_promise = link.next;
     JS_FreeValue(ctx, ctor);
     if (JS_IsException(result_promise))
         return result_promise;
@@ -50804,6 +52472,9 @@ static bool string_get_digits(const uint8_t *sp, int *pp, int *pval,
 
     p_start = p;
     while ((c = sp[p]) >= '0' && c <= '9') {
+        /* arbitrary limit to 9 digits */
+        if (v >= 100000000)
+            return false;
         v = v * 10 + c - '0';
         p++;
         if (p - p_start == max_digits)
@@ -50847,7 +52518,7 @@ static bool string_get_tzoffset(const uint8_t *sp, int *pp, int *tzp, bool stric
     sgn = sp[p++];
     if (sgn == '+' || sgn == '-') {
         int n = p;
-        if (!string_get_digits(sp, &p, &hh, 1, 9))
+        if (!string_get_digits(sp, &p, &hh, 1, 0))
             return false;
         n = p - n;
         if (strict && n != 2 && n != 4)
@@ -51041,7 +52712,7 @@ static bool js_date_parse_otherstring(const uint8_t *sp,
                 *is_local = false;
             } else {
                 p++;
-                if (string_get_digits(sp, &p, &val, 1, 9)) {
+                if (string_get_digits(sp, &p, &val, 1, 0)) {
                     if (c == '-') {
                         if (val == 0)
                             return false;
@@ -51052,7 +52723,7 @@ static bool js_date_parse_otherstring(const uint8_t *sp,
                 }
             }
         } else
-        if (string_get_digits(sp, &p, &val, 1, 9)) {
+        if (string_get_digits(sp, &p, &val, 1, 0)) {
             if (string_skip_char(sp, &p, ':')) {
                 /* time part */
                 fields[3] = val;
@@ -51444,7 +53115,9 @@ void JS_AddIntrinsicDate(JSContext *ctx)
 
 void JS_AddIntrinsicEval(JSContext *ctx)
 {
+#ifndef QJS_DISABLE_PARSER
     ctx->eval_internal = __JS_EvalInternal;
+#endif // QJS_DISABLE_PARSER
 }
 
 /* BigInt */
@@ -51460,40 +53133,25 @@ static JSValue JS_ToBigIntCtorFree(JSContext *ctx, JSValue val)
     case JS_TAG_BOOL:
         val = JS_NewBigInt64(ctx, JS_VALUE_GET_INT(val));
         break;
+    case JS_TAG_SHORT_BIG_INT:
     case JS_TAG_BIG_INT:
         break;
     case JS_TAG_FLOAT64:
         {
-            bf_t *a, a_s;
-
-            a = JS_ToBigInt1(ctx, &a_s, val);
-            if (!bf_is_finite(a)) {
-                JS_FreeValue(ctx, val);
-                val = JS_ThrowRangeError(ctx, "cannot convert NaN or Infinity to BigInt");
-            } else {
-                JSValue val1 = JS_NewBigInt(ctx);
-                bf_t *r;
-                int ret;
-                if (JS_IsException(val1)) {
-                    JS_FreeValue(ctx, val);
-                    return JS_EXCEPTION;
-                }
-                r = JS_GetBigInt(val1);
-                ret = bf_set(r, a);
-                ret |= bf_rint(r, BF_RNDZ);
-                JS_FreeValue(ctx, val);
-                if (ret & BF_ST_MEM_ERROR) {
-                    JS_FreeValue(ctx, val1);
-                    val = JS_ThrowOutOfMemory(ctx);
-                } else if (ret & BF_ST_INEXACT) {
-                    JS_FreeValue(ctx, val1);
+            double d = JS_VALUE_GET_FLOAT64(val);
+            JSBigInt *r;
+            int res;
+            r = js_bigint_from_float64(ctx, &res, d);
+            if (!r) {
+                if (res == 0) {
+                    val = JS_EXCEPTION;
+                } else if (res == 1) {
                     val = JS_ThrowRangeError(ctx, "cannot convert to BigInt: not an integer");
                 } else {
-                    val = JS_CompactBigInt(ctx, val1);
-                }
+                    val = JS_ThrowRangeError(ctx, "cannot convert NaN or Infinity to BigInt");                }
+            } else {
+                val = JS_CompactBigInt(ctx, r);
             }
-            if (a == &a_s)
-                bf_delete(a);
         }
         break;
     case JS_TAG_STRING:
@@ -51573,38 +53231,62 @@ static JSValue js_bigint_asUintN(JSContext *ctx,
                                   int argc, JSValueConst *argv, int asIntN)
 {
     uint64_t bits;
-    bf_t a_s, *a = &a_s, *r, mask_s, *mask = &mask_s;
-    JSValue res;
+    JSValue res, a;
 
     if (JS_ToIndex(ctx, &bits, argv[0]))
         return JS_EXCEPTION;
-    res = JS_NewBigInt(ctx);
-    if (JS_IsException(res))
-        return JS_EXCEPTION;
-    a = JS_ToBigInt(ctx, &a_s, argv[1]);
-    if (!a) {
-        JS_FreeValue(ctx, res);
+    a = JS_ToBigInt(ctx, argv[1]);
+    if (JS_IsException(a))
         return JS_EXCEPTION;
+    if (bits == 0) {
+        JS_FreeValue(ctx, a);
+        res = __JS_NewShortBigInt(ctx, 0);
+    } else if (JS_VALUE_GET_TAG(a) == JS_TAG_SHORT_BIG_INT) {
+        /* fast case */
+        if (bits >= JS_SHORT_BIG_INT_BITS) {
+            res = a;
+        } else {
+            uint64_t v;
+            int shift;
+            shift = 64 - bits;
+            v = JS_VALUE_GET_SHORT_BIG_INT(a);
+            v = v << shift;
+            if (asIntN)
+                v = (int64_t)v >> shift;
+            else
+                v = v >> shift;
+            res = __JS_NewShortBigInt(ctx, v);
+        }
+    } else {
+        JSBigInt *r, *p = JS_VALUE_GET_PTR(a);
+        if (bits >= p->len * JS_LIMB_BITS) {
+            res = a;
+        } else {
+            int len, shift, i;
+            js_limb_t v;
+            len = (bits + JS_LIMB_BITS - 1) / JS_LIMB_BITS;
+            r = js_bigint_new(ctx, len);
+            if (!r) {
+                JS_FreeValue(ctx, a);
+                return JS_EXCEPTION;
+            }
+            r->len = len;
+            for(i = 0; i < len - 1; i++)
+                r->tab[i] = p->tab[i];
+            shift = (-bits) & (JS_LIMB_BITS - 1);
+            /* 0 <= shift <= JS_LIMB_BITS - 1 */
+            v = p->tab[len - 1] << shift;
+            if (asIntN)
+                v = (js_slimb_t)v >> shift;
+            else
+                v = v >> shift;
+            r->tab[len - 1] = v;
+            r = js_bigint_normalize(ctx, r);
+            JS_FreeValue(ctx, a);
+            res = JS_CompactBigInt(ctx, r);
+        }
     }
-    /* XXX: optimize */
-    r = JS_GetBigInt(res);
-    bf_init(ctx->bf_ctx, mask);
-    bf_set_ui(mask, 1);
-    bf_mul_2exp(mask, bits, BF_PREC_INF, BF_RNDZ);
-    bf_add_si(mask, mask, -1, BF_PREC_INF, BF_RNDZ);
-    bf_logic_and(r, a, mask);
-    if (asIntN && bits != 0) {
-        bf_set_ui(mask, 1);
-        bf_mul_2exp(mask, bits - 1, BF_PREC_INF, BF_RNDZ);
-        if (bf_cmpu(r, mask) >= 0) {
-            bf_set_ui(mask, 1);
-            bf_mul_2exp(mask, bits, BF_PREC_INF, BF_RNDZ);
-            bf_sub(r, r, mask, BF_PREC_INF, BF_RNDZ);
-        }
-    }
-    bf_delete(mask);
-    JS_FreeBigInt(ctx, a, &a_s);
-    return JS_CompactBigInt(ctx, res);
+    return res;
 }
 
 static const JSCFunctionListEntry js_bigint_funcs[] = {
@@ -51778,6 +53460,10 @@ void JS_AddIntrinsicBaseObjects(JSContext *ctx)
     ctx->array_ctor = js_dup(obj);
     JS_SetPropertyFunctionList(ctx, obj, js_array_funcs,
                                countof(js_array_funcs));
+    JS_DefineAutoInitProperty(ctx, obj, JS_ATOM_fromAsync,
+                              JS_AUTOINIT_ID_BYTECODE,
+                              (void *)(uintptr_t)JS_BUILTIN_ARRAY_FROMASYNC,
+                              JS_PROP_WRITABLE|JS_PROP_CONFIGURABLE);
 
     /* XXX: create auto_initializer */
     {
@@ -52386,8 +54072,12 @@ static JSValue js_array_buffer_resize(JSContext *ctx, JSValueConst this_val,
     list_for_each(el, &abuf->array_list) {
         ta = list_entry(el, JSTypedArray, link);
         p = ta->obj;
-        if (p->class_id == JS_CLASS_DATAVIEW)
+        if (p->class_id == JS_CLASS_DATAVIEW) {
+            if (ta->track_rab && ta->offset < len)
+                ta->length = len - ta->offset;
+
             continue;
+        }
         p->u.array.count = 0;
         p->u.array.u.ptr = NULL;
         size_log2 = typed_array_size_log2(p->class_id);
@@ -53340,19 +55030,33 @@ static JSValue js_typed_array_indexOf(JSContext *ctx, JSValueConst this_val,
             v64 = d;
             is_int = (v64 == d);
         }
-    } else
-    if (tag == JS_TAG_BIG_INT) {
-        JSBigInt *p1 = JS_VALUE_GET_PTR(argv[0]);
+    } else if (tag == JS_TAG_BIG_INT || tag == JS_TAG_SHORT_BIG_INT) {
+        JSBigIntBuf buf1;
+        JSBigInt *p1;
+        int sz = (64 / JS_LIMB_BITS);
+        if (tag == JS_TAG_SHORT_BIG_INT)
+            p1 = js_bigint_set_short(&buf1, argv[0]);
+        else
+            p1 = JS_VALUE_GET_PTR(argv[0]);
 
         if (p->class_id == JS_CLASS_BIG_INT64_ARRAY) {
-            if (bf_get_int64(&v64, &p1->num, 0) != 0)
-                goto done;
+            if (p1->len > sz)
+                goto done; /* does not fit an int64 : cannot be found */
         } else if (p->class_id == JS_CLASS_BIG_UINT64_ARRAY) {
-            if (bf_get_uint64((uint64_t *)&v64, &p1->num) != 0)
+            if (js_bigint_sign(p1))
+                goto done; /* v < 0 */
+            if (p1->len <= sz) {
+                /* OK */
+            } else if (p1->len == sz + 1 && p1->tab[sz] == 0) {
+                /* 2^63 <= v <= 2^64-1 */
+            } else {
                 goto done;
+            }
         } else {
             goto done;
         }
+        if (JS_ToBigInt64(ctx, &v64, argv[0]))
+            goto exception;
         d = 0;
         is_bigint = 1;
     } else {
@@ -54259,10 +55963,8 @@ static JSValue js_array_from_iterator(JSContext *ctx, uint32_t *plen,
         val = JS_IteratorNext(ctx, iter, next_method, 0, NULL, &done);
         if (JS_IsException(val))
             goto fail;
-        if (done) {
-            JS_FreeValue(ctx, val);
+        if (done)
             break;
-        }
         if (JS_CreateDataPropertyUint32(ctx, arr, k, val, JS_PROP_THROW) < 0)
             goto fail;
         k++;
@@ -54776,8 +56478,7 @@ static JSValue js_dataview_setValue(JSContext *ctx,
     if (class_id <= JS_CLASS_UINT32_ARRAY) {
         if (JS_ToUint32(ctx, &v, val))
             return JS_EXCEPTION;
-    } else
-    if (class_id <= JS_CLASS_BIG_UINT64_ARRAY) {
+    } else if (class_id <= JS_CLASS_BIG_UINT64_ARRAY) {
         if (JS_ToBigInt64(ctx, (int64_t *)&v64, val))
             return JS_EXCEPTION;
     } else {
@@ -55156,7 +56857,7 @@ static JSValue js_atomics_store(JSContext *ctx,
         return JS_EXCEPTION;
     if (size_log2 == 3) {
         int64_t v64;
-        ret = JS_ToBigIntValueFree(ctx, js_dup(argv[2]));
+        ret = JS_ToBigIntFree(ctx, js_dup(argv[2]));
         if (JS_IsException(ret))
             return ret;
         if (JS_ToBigInt64(ctx, &v64, ret)) {
@@ -55846,6 +57547,7 @@ static void reset_weak_ref(JSRuntime *rt, JSWeakRefRecord **first_weak_ref)
             assert(!mr->empty); /* no iterator on WeakMap/WeakSet */
             list_del(&mr->hash_link);
             list_del(&mr->link);
+            s->record_count--;
             break;
         case JS_WEAK_REF_KIND_WEAK_REF:
             wrd = wr->u.weak_ref_data;
@@ -56082,6 +57784,7 @@ static void _JS_AddIntrinsicCallSite(JSContext *ctx)
 
 bool JS_DetectModule(const char *input, size_t input_len)
 {
+#ifndef QJS_DISABLE_PARSER
     JSRuntime *rt;
     JSContext *ctx;
     JSValue val;
@@ -56110,6 +57813,9 @@ bool JS_DetectModule(const char *input, size_t input_len)
     JS_FreeContext(ctx);
     JS_FreeRuntime(rt);
     return is_module;
+#else
+    return false;
+#endif // QJS_DISABLE_PARSER
 }
 
 uintptr_t js_std_cmd(int cmd, ...) {
diff --git a/lib/monoucha0/monoucha/qjs/quickjs.h b/lib/monoucha0/monoucha/qjs/quickjs.h
index d1530db9..8fa81616 100644
--- a/lib/monoucha0/monoucha/qjs/quickjs.h
+++ b/lib/monoucha0/monoucha/qjs/quickjs.h
@@ -102,7 +102,8 @@ enum {
     JS_TAG_UNINITIALIZED = 4,
     JS_TAG_CATCH_OFFSET = 5,
     JS_TAG_EXCEPTION   = 6,
-    JS_TAG_FLOAT64     = 7,
+    JS_TAG_SHORT_BIG_INT = 7,
+    JS_TAG_FLOAT64     = 8,
     /* any larger tag is FLOAT64 if JS_NAN_BOXING */
 };
 
@@ -136,6 +137,7 @@ typedef const struct JSValue *JSValueConst;
 #define JS_MKPTR(tag, ptr)       ((JSValue)((tag) | (intptr_t)(ptr)))
 #define JS_VALUE_GET_NORM_TAG(v) ((int)((intptr_t)(v) & 15))
 #define JS_VALUE_GET_TAG(v)      ((int)((intptr_t)(v) & 15))
+#define JS_VALUE_GET_SHORT_BIG_INT(v) JS_VALUE_GET_INT(v)
 #define JS_VALUE_GET_PTR(v)      ((void *)((intptr_t)(v) & ~15))
 #define JS_VALUE_GET_INT(v)      ((int)((intptr_t)(v) >> 4))
 #define JS_VALUE_GET_BOOL(v)     ((int)((intptr_t)(v) >> 4))
@@ -148,6 +150,12 @@ static inline JSValue __JS_NewFloat64(double d)
     return JS_MKVAL(JS_TAG_FLOAT64, (int)d);
 }
 
+static inline JSValue __JS_NewShortBigInt(JSContext *ctx, int32_t d)
+{
+    (void)&ctx;
+    return JS_MKVAL(JS_TAG_SHORT_BIG_INT, d);
+}
+
 static inline bool JS_VALUE_IS_NAN(JSValue v)
 {
     (void)&v;
@@ -161,6 +169,7 @@ typedef uint64_t JSValue;
 #define JS_VALUE_GET_TAG(v) (int)((v) >> 32)
 #define JS_VALUE_GET_INT(v) (int)(v)
 #define JS_VALUE_GET_BOOL(v) (int)(v)
+#define JS_VALUE_GET_SHORT_BIG_INT(v) (int)(v)
 #define JS_VALUE_GET_PTR(v) (void *)(intptr_t)(v)
 
 #define JS_MKVAL(tag, val) (((uint64_t)(tag) << 32) | (uint32_t)(val))
@@ -197,6 +206,12 @@ static inline JSValue __JS_NewFloat64(double d)
     return v;
 }
 
+static inline JSValue __JS_NewShortBigInt(JSContext *ctx, int32_t d)
+{
+    (void)&ctx;
+    return JS_MKVAL(JS_TAG_SHORT_BIG_INT, d);
+}
+
 #define JS_TAG_IS_FLOAT64(tag) ((unsigned)((tag) - JS_TAG_FIRST) >= (JS_TAG_FLOAT64 - JS_TAG_FIRST))
 
 /* same as JS_VALUE_GET_TAG, but return JS_TAG_FLOAT64 with NaN boxing */
@@ -223,6 +238,7 @@ typedef union JSValueUnion {
     int32_t int32;
     double float64;
     void *ptr;
+    int32_t short_big_int;
 } JSValueUnion;
 
 typedef struct JSValue {
@@ -236,6 +252,7 @@ typedef struct JSValue {
 #define JS_VALUE_GET_INT(v) ((v).u.int32)
 #define JS_VALUE_GET_BOOL(v) ((v).u.int32)
 #define JS_VALUE_GET_FLOAT64(v) ((v).u.float64)
+#define JS_VALUE_GET_SHORT_BIG_INT(v) ((v).u.short_big_int)
 #define JS_VALUE_GET_PTR(v) ((v).u.ptr)
 
 /* msvc doesn't understand designated initializers without /std:c++20 */
@@ -281,6 +298,15 @@ static inline JSValue __JS_NewFloat64(double d)
     return v;
 }
 
+static inline JSValue __JS_NewShortBigInt(JSContext *ctx, int64_t d)
+{
+    (void)&ctx;
+    JSValue v;
+    v.tag = JS_TAG_SHORT_BIG_INT;
+    v.u.short_big_int = d;
+    return v;
+}
+
 static inline bool JS_VALUE_IS_NAN(JSValue v)
 {
     union {
@@ -671,7 +697,8 @@ static inline bool JS_IsNumber(JSValueConst v)
 static inline bool JS_IsBigInt(JSContext *ctx, JSValueConst v)
 {
     (void)&ctx;
-    return JS_VALUE_GET_TAG(v) == JS_TAG_BIG_INT;
+    int tag = JS_VALUE_GET_TAG(v);
+    return tag == JS_TAG_BIG_INT || tag == JS_TAG_SHORT_BIG_INT;
 }
 
 static inline bool JS_IsBool(JSValueConst v)
@@ -767,6 +794,10 @@ JS_EXTERN JSValue JS_NewStringLen(JSContext *ctx, const char *str1, size_t len1)
 static inline JSValue JS_NewString(JSContext *ctx, const char *str) {
     return JS_NewStringLen(ctx, str, strlen(str));
 }
+// makes a copy of the input; does not check if the input is valid UTF-16,
+// that is the responsibility of the caller
+JS_EXTERN JSValue JS_NewTwoByteString(JSContext *ctx, const uint16_t *buf,
+                                      size_t len);
 JS_EXTERN JSValue JS_NewAtomString(JSContext *ctx, const char *str);
 JS_EXTERN JSValue JS_ToString(JSContext *ctx, JSValueConst val);
 JS_EXTERN JSValue JS_ToPropertyKey(JSContext *ctx, JSValueConst val);
@@ -886,6 +917,7 @@ JS_EXTERN JSValue JS_CallConstructor2(JSContext *ctx, JSValueConst func_obj,
  * wholly infallible: non-strict classic scripts may _parse_ okay as a module
  * but not _execute_ as one (different runtime semantics.) Use with caution.
  * |input| can be either ASCII or UTF-8 encoded source code.
+ * Returns false if QuickJS was built with -DQJS_DISABLE_PARSER.
  */
 JS_EXTERN bool JS_DetectModule(const char *input, size_t input_len);
 /* 'input' must be zero terminated i.e. input[input_len] = '\0'. */
@@ -985,6 +1017,23 @@ JS_EXTERN bool JS_IsPromise(JSValueConst val);
 
 JS_EXTERN JSValue JS_NewSymbol(JSContext *ctx, const char *description, bool is_global);
 
+typedef enum JSPromiseHookType {
+    JS_PROMISE_HOOK_INIT,     // emitted when a new promise is created
+    JS_PROMISE_HOOK_BEFORE,   // runs right before promise.then is invoked
+    JS_PROMISE_HOOK_AFTER,    // runs right after promise.then is invoked
+    JS_PROMISE_HOOK_RESOLVE,  // not emitted for rejected promises
+} JSPromiseHookType;
+
+// parent_promise is only passed in when type == JS_PROMISE_HOOK_INIT and
+// is then either a promise object or JS_UNDEFINED if the new promise does
+// not have a parent promise; only promises created with promise.then have
+// a parent promise
+typedef void JSPromiseHook(JSContext *ctx, JSPromiseHookType type,
+                           JSValueConst promise, JSValueConst parent_promise,
+                           void *opaque);
+JS_EXTERN void JS_SetPromiseHook(JSRuntime *rt, JSPromiseHook promise_hook,
+                                 void *opaque);
+
 /* is_handled = true means that the rejection is handled */
 typedef void JSHostPromiseRejectionTracker(JSContext *ctx, JSValueConst promise,
                                            JSValueConst reason,
@@ -1206,14 +1255,14 @@ JS_EXTERN int JS_AddModuleExportList(JSContext *ctx, JSModuleDef *m,
                                       const JSCFunctionListEntry *tab, int len);
 /* can only be called after the module is instantiated */
 JS_EXTERN int JS_SetModuleExport(JSContext *ctx, JSModuleDef *m, const char *export_name,
-                                 JSValueConst val);
+                                 JSValue val);
 JS_EXTERN int JS_SetModuleExportList(JSContext *ctx, JSModuleDef *m,
                                      const JSCFunctionListEntry *tab, int len);
 
 /* Version */
 
 #define QJS_VERSION_MAJOR 0
-#define QJS_VERSION_MINOR 9
+#define QJS_VERSION_MINOR 10
 #define QJS_VERSION_PATCH 0
 #define QJS_VERSION_SUFFIX ""
 
diff --git a/lib/monoucha0/monoucha/qjs/xsum.c b/lib/monoucha0/monoucha/qjs/xsum.c
new file mode 100644
index 00000000..98d970b8
--- /dev/null
+++ b/lib/monoucha0/monoucha/qjs/xsum.c
@@ -0,0 +1,1122 @@
+/* FUNCTIONS FOR EXACT SUMMATION. */
+
+/* Copyright 2015, 2018, 2021, 2024 Radford M. Neal
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <string.h>
+#include <math.h>
+#include "xsum.h"
+
+
+/* ---------------------- IMPLEMENTATION ASSUMPTIONS ----------------------- */
+
+/* This code makes the following assumptions:
+
+     o The 'double' type is a IEEE-754 standard 64-bit floating-point value.
+
+     o The 'int64_t' and 'uint64_t' types exist, for 64-bit signed and
+       unsigned integers.
+
+     o The 'endianness' of 'double' and 64-bit integers is consistent
+       between these types - that is, looking at the bits of a 'double'
+       value as an 64-bit integer will have the expected result.
+
+     o Right shifts of a signed operand produce the results expected for
+       a two's complement representation.
+
+     o Rounding should be done in the "round to nearest, ties to even" mode.
+*/
+
+
+/* --------------------------- CONFIGURATION ------------------------------- */
+
+
+/* IMPLEMENTATION OPTIONS.  Can be set to either 0 or 1, whichever seems
+   to be fastest. */
+
+#define USE_SIMD 1          /* Use SIMD intrinsics (SSE2/AVX) if available?   */
+
+#define USE_MEMSET_SMALL 1  /* Use memset rather than a loop (for small mem)? */
+
+#define OPT_SMALL 0         /* Class of manual optimization for operations on */
+                            /*   small accumulator: 0 (none), 1, 2, 3 (SIMD)  */
+#define OPT_CARRY 1         /* Use manually optimized carry propagation?      */
+
+#define INLINE_SMALL 1      /* Inline more of the small accumulator routines? */
+                            /*   (Not currently used)                         */
+
+
+/* INCLUDE INTEL INTRINSICS IF USED AND AVAILABLE. */
+
+#if USE_SIMD && __SSE2__
+# include <immintrin.h>
+#endif
+
+
+/* COPY A 64-BIT QUANTITY - DOUBLE TO 64-BIT INT OR VICE VERSA.  The
+   arguments are destination and source variables (not values). */
+
+#define COPY64(dst,src) memcpy(&(dst),&(src),sizeof(double))
+
+
+/* SET UP DEBUG FLAG.  It's a variable if debugging is enabled, and a
+   constant if disabled (so that no code will be generated then). */
+
+int xsum_debug = 0;
+
+#ifndef DEBUG
+# define xsum_debug 0
+#endif
+
+
+/* SET UP INLINE / NOINLINE MACROS. */
+
+#if __GNUC__
+# define INLINE inline __attribute__ ((always_inline))
+# define NOINLINE __attribute__ ((noinline))
+#else
+# define INLINE inline
+# define NOINLINE
+#endif
+
+
+/* ------------------------ INTERNAL ROUTINES ------------------------------- */
+
+
+/* ADD AN INF OR NAN TO A SMALL ACCUMULATOR.  This only changes the flags,
+   not the chunks in the accumulator, which retains the sum of the finite
+   terms (which is perhaps sometimes useful to access, though no function
+   to do so is defined at present).  A NaN with larger payload (seen as a
+   52-bit unsigned integer) takes precedence, with the sign of the NaN always
+   being positive.  This ensures that the order of summing NaN values doesn't
+   matter. */
+
+static NOINLINE void xsum_small_add_inf_nan
+                       (xsum_small_accumulator *restrict sacc, xsum_int ivalue)
+{
+  xsum_int mantissa;
+  double fltv;
+
+  mantissa = ivalue & XSUM_MANTISSA_MASK;
+
+  if (mantissa == 0) /* Inf */
+  { if (sacc->Inf == 0)
+    { /* no previous Inf */
+      sacc->Inf = ivalue;
+    }
+    else if (sacc->Inf != ivalue)
+    { /* previous Inf was opposite sign */
+      COPY64 (fltv, ivalue);
+      fltv = fltv - fltv;  /* result will be a NaN */
+      COPY64 (sacc->Inf, fltv);
+    }
+  }
+  else /* NaN */
+  { /* Choose the NaN with the bigger payload and clear its sign.  Using <=
+       ensures that we will choose the first NaN over the previous zero. */
+    if ((sacc->NaN & XSUM_MANTISSA_MASK) <= mantissa)
+    { sacc->NaN = ivalue & ~XSUM_SIGN_MASK;
+    }
+  }
+}
+
+
+/* PROPAGATE CARRIES TO NEXT CHUNK IN A SMALL ACCUMULATOR.  Needs to
+   be called often enough that accumulated carries don't overflow out
+   the top, as indicated by sacc->adds_until_propagate.  Returns the
+   index of the uppermost non-zero chunk (0 if number is zero).
+
+   After carry propagation, the uppermost non-zero chunk will indicate
+   the sign of the number, and will not be -1 (all 1s).  It will be in
+   the range -2^XSUM_LOW_MANTISSA_BITS to 2^XSUM_LOW_MANTISSA_BITS - 1.
+   Lower chunks will be non-negative, and in the range from 0 up to
+   2^XSUM_LOW_MANTISSA_BITS - 1. */
+
+static NOINLINE int xsum_carry_propagate (xsum_small_accumulator *restrict sacc)
+{
+  int i, u, uix;
+
+  /* Set u to the index of the uppermost non-zero (for now) chunk, or
+     return with value 0 if there is none. */
+
+# if OPT_CARRY
+
+  { u = XSUM_SCHUNKS-1;
+    switch (XSUM_SCHUNKS & 0x3)   /* get u to be a multiple of 4 minus one  */
+    {
+      case 3: if (sacc->chunk[u] != 0)
+              { goto found2;
+              }
+              u -= 1;                            /* XSUM_SCHUNKS is a */
+      case 2: if (sacc->chunk[u] != 0)           /* constant, so the  */
+              { goto found2;                     /* compiler will do  */
+              }                                  /* simple code here  */
+              u -= 1;
+      case 1: if (sacc->chunk[u] != 0)
+              { goto found2;
+              }
+              u -= 1;
+      case 0: ;
+    }
+
+    do  /* here, u should be a multiple of 4 minus one, and at least 3 */
+    {
+#     if USE_SIMD && __AVX__
+      { __m256i ch;
+        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+u-3));
+        if (!_mm256_testz_si256(ch,ch))
+        { goto found;
+        }
+        u -= 4;
+        if (u < 0)  /* never actually happens, because value of XSUM_SCHUNKS */
+        { break;    /*   is such that u < 0 occurs at end of do loop instead */
+        }
+        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+u-3));
+        if (!_mm256_testz_si256(ch,ch))
+        { goto found;
+        }
+        u -= 4;
+      }
+#     else
+      { if (sacc->chunk[u] | sacc->chunk[u-1]
+          | sacc->chunk[u-2] | sacc->chunk[u-3])
+        { goto found;
+        }
+        u -= 4;
+      }
+#     endif
+
+    } while (u >= 0);
+
+    uix = 0;
+    goto done;
+
+  found:
+    if (sacc->chunk[u] != 0)
+    { goto found2;
+    }
+    u -= 1;
+    if (sacc->chunk[u] != 0)
+    { goto found2;
+    }
+    u -= 1;
+    if (sacc->chunk[u] != 0)
+    { goto found2;
+    }
+    u -= 1;
+
+   found2: ;
+  }
+
+# else  /* Non-optimized search for uppermost non-zero chunk */
+
+  { for (u = XSUM_SCHUNKS-1; sacc->chunk[u] == 0; u--)
+    { if (u == 0)
+      {
+        uix = 0;
+        goto done;
+      }
+    }
+  }
+
+# endif
+
+  /* At this point, sacc->chunk[u] must be non-zero */
+
+  /* Carry propagate, starting at the low-order chunks.  Note that the
+     loop limit of u may be increased inside the loop. */
+
+  i = 0;     /* set to the index of the next non-zero chunk, from bottom */
+
+# if OPT_CARRY
+  {
+    /* Quickly skip over unused low-order chunks.  Done here at the start
+       on the theory that there are often many unused low-order chunks,
+       justifying some overhead to begin, but later stretches of unused
+       chunks may not be as large. */
+
+    int e = u-3;  /* go only to 3 before so won't access beyond chunk array */
+
+    do
+    {
+#     if USE_SIMD && __AVX__
+      { __m256i ch;
+        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+i));
+        if (!_mm256_testz_si256(ch,ch))
+        { break;
+        }
+        i += 4;
+        if (i >= e)
+        { break;
+        }
+        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+i));
+        if (!_mm256_testz_si256(ch,ch))
+        { break;
+        }
+      }
+#     else
+      { if (sacc->chunk[i] | sacc->chunk[i+1]
+          | sacc->chunk[i+2] | sacc->chunk[i+3])
+        { break;
+        }
+      }
+#     endif
+
+      i += 4;
+
+    } while (i <= e);
+  }
+# endif
+
+  uix = -1;  /* indicates that a non-zero chunk has not been found yet */
+
+  do
+  { xsum_schunk c;       /* Set to the chunk at index i (next non-zero one) */
+    xsum_schunk clow;    /* Low-order bits of c */
+    xsum_schunk chigh;   /* High-order bits of c */
+
+    /* Find the next non-zero chunk, setting i to its index, or break out
+       of loop if there is none.  Note that the chunk at index u is not
+       necessarily non-zero - it was initially, but u or the chunk at u
+       may have changed. */
+
+#   if OPT_CARRY
+    {
+      c = sacc->chunk[i];
+      if (c != 0)
+      { goto nonzero;
+      }
+      i += 1;
+      if (i > u)
+      { break;  /* reaching here is only possible when u == i initially, */
+      }         /*   with the last add to a chunk having changed it to 0 */
+
+      for (;;)
+      { c = sacc->chunk[i];
+        if (c != 0)
+        { goto nonzero;
+        }
+        i += 1;
+        c = sacc->chunk[i];
+        if (c != 0)
+        { goto nonzero;
+        }
+        i += 1;
+        c = sacc->chunk[i];
+        if (c != 0)
+        { goto nonzero;
+        }
+        i += 1;
+        c = sacc->chunk[i];
+        if (c != 0)
+        { goto nonzero;
+        }
+        i += 1;
+      }
+    }
+#   else
+    {
+      do
+      { c = sacc->chunk[i];
+        if (c != 0)
+        { goto nonzero;
+        }
+        i += 1;
+      } while (i <= u);
+
+      break;
+    }
+#   endif
+
+    /* Propagate possible carry from this chunk to next chunk up. */
+
+  nonzero:
+    chigh = c >> XSUM_LOW_MANTISSA_BITS;
+    if (chigh == 0)
+    { uix = i;
+      i += 1;
+      continue;  /* no need to change this chunk */
+    }
+
+    if (u == i)
+    { if (chigh == -1)
+      { uix = i;
+        break;   /* don't propagate -1 into the region of all zeros above */
+      }
+      u = i+1;   /* we will change chunk[u+1], so we'll need to look at it */
+    }
+
+    clow = c & XSUM_LOW_MANTISSA_MASK;
+    if (clow != 0)
+    { uix = i;
+    }
+
+    /* We now change chunk[i] and add to chunk[i+1]. Note that i+1 should be
+       in range (no bigger than XSUM_CHUNKS-1) if summing memory, since
+       the number of chunks is big enough to hold any sum, and we do not
+       store redundant chunks with values 0 or -1 above previously non-zero
+       chunks.  But other add operations might cause overflow, in which
+       case we produce a NaN with all 1s as payload.  (We can't reliably produce
+       an Inf of the right sign.) */
+
+    sacc->chunk[i] = clow;
+    if (i+1 >= XSUM_SCHUNKS)
+    { xsum_small_add_inf_nan (sacc,
+        ((xsum_int)XSUM_EXP_MASK << XSUM_MANTISSA_BITS) | XSUM_MANTISSA_MASK);
+      u = i;
+    }
+    else
+    { sacc->chunk[i+1] += chigh;  /* note: this could make this chunk be zero */
+    }
+
+    i += 1;
+
+  } while (i <= u);
+
+  /* Check again for the number being zero, since carry propagation might
+     have created zero from something that initially looked non-zero. */
+
+  if (uix < 0)
+  {
+    uix = 0;
+    goto done;
+  }
+
+  /* While the uppermost chunk is negative, with value -1, combine it with
+     the chunk below (if there is one) to produce the same number but with
+     one fewer non-zero chunks. */
+
+  while (sacc->chunk[uix] == -1 && uix > 0)
+  { /* Left shift of a negative number is undefined according to the standard,
+       so do a multiply - it's all presumably constant-folded by the compiler.*/
+    sacc->chunk[uix-1] += ((xsum_schunk) -1)
+                             * (((xsum_schunk) 1) << XSUM_LOW_MANTISSA_BITS);
+    sacc->chunk[uix] = 0;
+    uix -= 1;
+  }
+
+  /* We can now add one less than the total allowed terms before the
+     next carry propagate. */
+
+done:
+  sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS-1;
+
+  /* Return index of uppermost non-zero chunk. */
+
+  return uix;
+}
+
+
+/* ------------------------ EXTERNAL ROUTINES ------------------------------- */
+
+
+/* INITIALIZE A SMALL ACCUMULATOR TO ZERO. */
+
+void xsum_small_init (xsum_small_accumulator *restrict sacc)
+{
+  sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS;
+  sacc->Inf = sacc->NaN = 0;
+# if USE_MEMSET_SMALL
+  { memset (sacc->chunk, 0, XSUM_SCHUNKS * sizeof(xsum_schunk));
+  }
+# elif USE_SIMD && __AVX__ && XSUM_SCHUNKS==67
+  { xsum_schunk *ch = sacc->chunk;
+    __m256i z = _mm256_setzero_si256();
+    _mm256_storeu_si256 ((__m256i *)(ch+0), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+4), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+8), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+12), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+16), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+20), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+24), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+28), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+32), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+36), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+40), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+44), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+48), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+52), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+56), z);
+    _mm256_storeu_si256 ((__m256i *)(ch+60), z);
+    _mm_storeu_si128    ((__m128i *)(ch+64), _mm256_castsi256_si128(z));
+    _mm_storeu_si64     (ch+66, _mm256_castsi256_si128(z));
+  }
+# else
+  { xsum_schunk *p;
+    int n;
+    p = sacc->chunk;
+    n = XSUM_SCHUNKS;
+    do { *p++ = 0; n -= 1; } while (n > 0);
+  }
+# endif
+}
+
+
+/* ADD ONE NUMBER TO A SMALL ACCUMULATOR ASSUMING NO CARRY PROPAGATION REQ'D.
+   This function is declared INLINE regardless of the setting of INLINE_SMALL
+   and for good performance it must be inlined by the compiler (otherwise the
+   procedure call overhead will result in substantial inefficiency). */
+
+static INLINE void xsum_add1_no_carry (xsum_small_accumulator *restrict sacc,
+                                       xsum_flt value)
+{
+  xsum_int ivalue;
+  xsum_int mantissa;
+  xsum_expint exp, low_exp, high_exp;
+  xsum_schunk *chunk_ptr;
+
+  /* Extract exponent and mantissa.  Split exponent into high and low parts. */
+
+  COPY64 (ivalue, value);
+
+  exp = (ivalue >> XSUM_MANTISSA_BITS) & XSUM_EXP_MASK;
+  mantissa = ivalue & XSUM_MANTISSA_MASK;
+  high_exp = exp >> XSUM_LOW_EXP_BITS;
+  low_exp = exp & XSUM_LOW_EXP_MASK;
+
+  /* Categorize number as normal, denormalized, or Inf/NaN according to
+     the value of the exponent field. */
+
+  if (exp == 0) /* zero or denormalized */
+  { /* If it's a zero (positive or negative), we do nothing. */
+    if (mantissa == 0)
+    { return;
+    }
+    /* Denormalized mantissa has no implicit 1, but exponent is 1 not 0. */
+    exp = low_exp = 1;
+  }
+  else if (exp == XSUM_EXP_MASK)  /* Inf or NaN */
+  { /* Just update flags in accumulator structure. */
+    xsum_small_add_inf_nan (sacc, ivalue);
+    return;
+  }
+  else /* normalized */
+  { /* OR in implicit 1 bit at top of mantissa */
+    mantissa |= (xsum_int)1 << XSUM_MANTISSA_BITS;
+  }
+
+  /* Use high part of exponent as index of chunk, and low part of
+     exponent to give position within chunk.  Fetch the two chunks
+     that will be modified. */
+
+  chunk_ptr = sacc->chunk + high_exp;
+
+  /* Separate mantissa into two parts, after shifting, and add to (or
+     subtract from) this chunk and the next higher chunk (which always
+     exists since there are three extra ones at the top).
+
+     Note that low_mantissa will have at most XSUM_LOW_MANTISSA_BITS bits,
+     while high_mantissa will have at most XSUM_MANTISSA_BITS bits, since
+     even though the high mantissa includes the extra implicit 1 bit, it will
+     also be shifted right by at least one bit. */
+
+  xsum_int split_mantissa[2];
+  split_mantissa[0] = ((xsum_uint)mantissa << low_exp) & XSUM_LOW_MANTISSA_MASK;
+  split_mantissa[1] = mantissa >> (XSUM_LOW_MANTISSA_BITS - low_exp);
+
+  /* Add to, or subtract from, the two affected chunks. */
+
+# if OPT_SMALL==1
+  { xsum_int ivalue_sign = ivalue<0 ? -1 : 1;
+    chunk_ptr[0] += ivalue_sign * split_mantissa[0];
+    chunk_ptr[1] += ivalue_sign * split_mantissa[1];
+  }
+# elif OPT_SMALL==2
+  { xsum_int ivalue_neg
+              = ivalue>>(XSUM_SCHUNK_BITS-1); /* all 0s if +ve, all 1s if -ve */
+    chunk_ptr[0] += (split_mantissa[0] ^ ivalue_neg) + (ivalue_neg & 1);
+    chunk_ptr[1] += (split_mantissa[1] ^ ivalue_neg) + (ivalue_neg & 1);
+  }
+# elif OPT_SMALL==3 && USE_SIMD && __SSE2__
+  { xsum_int ivalue_neg
+              = ivalue>>(XSUM_SCHUNK_BITS-1); /* all 0s if +ve, all 1s if -ve */
+    _mm_storeu_si128 ((__m128i *)chunk_ptr,
+                      _mm_add_epi64 (_mm_loadu_si128 ((__m128i *)chunk_ptr),
+                       _mm_add_epi64 (_mm_set1_epi64((__m64)(ivalue_neg&1)),
+                        _mm_xor_si128 (_mm_set1_epi64((__m64)ivalue_neg),
+                         _mm_loadu_si128 ((__m128i *)split_mantissa)))));
+  }
+# else
+  { if (ivalue < 0)
+    { chunk_ptr[0] -= split_mantissa[0];
+      chunk_ptr[1] -= split_mantissa[1];
+    }
+    else
+    { chunk_ptr[0] += split_mantissa[0];
+      chunk_ptr[1] += split_mantissa[1];
+    }
+  }
+# endif
+}
+
+
+/* ADD ONE DOUBLE TO A SMALL ACCUMULATOR.  This is equivalent to, but
+   somewhat faster than, calling xsum_small_addv with a vector of one
+   value. */
+
+void xsum_small_add1 (xsum_small_accumulator *restrict sacc, xsum_flt value)
+{
+  if (sacc->adds_until_propagate == 0)
+  { (void) xsum_carry_propagate(sacc);
+  }
+
+  xsum_add1_no_carry (sacc, value);
+
+  sacc->adds_until_propagate -= 1;
+}
+
+
+/* ADD A VECTOR OF FLOATING-POINT NUMBERS TO A SMALL ACCUMULATOR.  Mixes
+   calls of xsum_carry_propagate with calls of xsum_add1_no_carry. */
+
+void xsum_small_addv (xsum_small_accumulator *restrict sacc,
+                      const xsum_flt *restrict vec,
+                      xsum_length n)
+{ xsum_length m, i;
+
+  while (n > 0)
+  { if (sacc->adds_until_propagate == 0)
+    { (void) xsum_carry_propagate(sacc);
+    }
+    m = n <= sacc->adds_until_propagate ? n : sacc->adds_until_propagate;
+    for (i = 0; i < m; i++)
+    { xsum_add1_no_carry (sacc, vec[i]);
+    }
+    sacc->adds_until_propagate -= m;
+    vec += m;
+    n -= m;
+  }
+}
+
+
+/* ADD SQUARED NORM OF VECTOR OF FLOATING-POINT NUMBERS TO SMALL ACCUMULATOR.
+   Mixes calls of xsum_carry_propagate with calls of xsum_add1_no_carry. */
+
+void xsum_small_add_sqnorm (xsum_small_accumulator *restrict sacc,
+                            const xsum_flt *restrict vec,
+                            xsum_length n)
+{ xsum_length m, i;
+
+  while (n > 0)
+  { if (sacc->adds_until_propagate == 0)
+    { (void) xsum_carry_propagate(sacc);
+    }
+    m = n <= sacc->adds_until_propagate ? n : sacc->adds_until_propagate;
+    for (i = 0; i < m; i++)
+    { xsum_add1_no_carry (sacc, vec[i] * vec[i]);
+    }
+    sacc->adds_until_propagate -= m;
+    vec += m;
+    n -= m;
+  }
+}
+
+
+/* ADD DOT PRODUCT OF VECTORS OF FLOATING-POINT NUMBERS TO SMALL ACCUMULATOR.
+   Mixes calls of xsum_carry_propagate with calls of xsum_add1_no_carry. */
+
+void xsum_small_add_dot (xsum_small_accumulator *restrict sacc,
+                         const xsum_flt *vec1, const xsum_flt *vec2,
+                         xsum_length n)
+{ xsum_length m, i;
+
+  while (n > 0)
+  { if (sacc->adds_until_propagate == 0)
+    { (void) xsum_carry_propagate(sacc);
+    }
+    m = n <= sacc->adds_until_propagate ? n : sacc->adds_until_propagate;
+    for (i = 0; i < m; i++)
+    { xsum_add1_no_carry (sacc, vec1[i] * vec2[i]);
+    }
+    sacc->adds_until_propagate -= m;
+    vec1 += m;
+    vec2 += m;
+    n -= m;
+  }
+}
+
+
+/* ADD A SMALL ACCUMULATOR TO ANOTHER SMALL ACCUMULATOR.  The first argument
+   is the destination, which is modified.  The second is the accumulator to
+   add, which may also be modified, but should still represent the same
+   number.  Source and destination may be the same. */
+
+void xsum_small_add_accumulator (xsum_small_accumulator *dst_sacc,
+                                 xsum_small_accumulator *src_sacc)
+{
+  int i;
+
+  xsum_carry_propagate (dst_sacc);
+
+  if (dst_sacc == src_sacc)
+  { for (i = 0; i < XSUM_SCHUNKS; i++)
+    { dst_sacc->chunk[i] += dst_sacc->chunk[i];
+    }
+  }
+  else
+  {
+    xsum_carry_propagate (src_sacc);
+
+    if (src_sacc->Inf) xsum_small_add_inf_nan (dst_sacc, src_sacc->Inf);
+    if (src_sacc->NaN) xsum_small_add_inf_nan (dst_sacc, src_sacc->NaN);
+
+    for (i = 0; i < XSUM_SCHUNKS; i++)
+    { dst_sacc->chunk[i] += src_sacc->chunk[i];
+    }
+  }
+
+  dst_sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS-2;
+}
+
+
+/* NEGATE THE VALUE IN A SMALL ACCUMULATOR. */
+
+void xsum_small_negate (xsum_small_accumulator *restrict sacc)
+{
+  int i;
+
+  for (i = 0; i < XSUM_SCHUNKS; i++)
+  { sacc->chunk[i] = -sacc->chunk[i];
+  }
+
+  if (sacc->Inf != 0)
+  { sacc->Inf ^= XSUM_SIGN_MASK;
+  }
+}
+
+
+/* RETURN THE RESULT OF ROUNDING A SMALL ACCUMULATOR.  The rounding mode
+   is to nearest, with ties to even.  The small accumulator may be modified
+   by this operation (by carry propagation being done), but the value it
+   represents should not change. */
+
+xsum_flt xsum_small_round (xsum_small_accumulator *restrict sacc)
+{
+  xsum_int ivalue;
+  xsum_schunk lower;
+  int i, j, e, more;
+  xsum_int intv;
+  double fltv;
+
+  /* See if we have a NaN from one of the numbers being a NaN, in
+     which case we return the NaN with largest payload, or an infinite
+     result (+Inf, -Inf, or a NaN if both +Inf and -Inf occurred).
+     Note that we do NOT return NaN if we have both an infinite number
+     and a sum of other numbers that overflows with opposite sign,
+     since there is no real ambiguity regarding the sign in such a case. */
+
+  if (sacc->NaN != 0)
+  { COPY64(fltv, sacc->NaN);
+    return fltv;
+  }
+
+  if (sacc->Inf != 0)
+  { COPY64 (fltv, sacc->Inf);
+    return fltv;
+  }
+
+  /* If none of the numbers summed were infinite or NaN, we proceed to
+     propagate carries, as a preliminary to finding the magnitude of
+     the sum.  This also ensures that the sign of the result can be
+     determined from the uppermost non-zero chunk.
+
+     We also find the index, i, of this uppermost non-zero chunk, as
+     the value returned by xsum_carry_propagate, and set ivalue to
+     sacc->chunk[i].  Note that ivalue will not be 0 or -1, unless
+     i is 0 (the lowest chunk), in which case it will be handled by
+     the code for denormalized numbers. */
+
+  i = xsum_carry_propagate(sacc);
+
+  ivalue = sacc->chunk[i];
+
+  /* Handle a possible denormalized number, including zero. */
+
+  if (i <= 1)
+  {
+    /* Check for zero value, in which case we can return immediately. */
+
+    if (ivalue == 0)
+    { return 0.0;
+    }
+
+    /* Check if it is actually a denormalized number.  It always is if only
+       the lowest chunk is non-zero.  If the highest non-zero chunk is the
+       next-to-lowest, we check the magnitude of the absolute value.
+       Note that the real exponent is 1 (not 0), so we need to shift right
+       by 1 here. */
+
+    if (i == 0)
+    { intv = ivalue >= 0 ? ivalue : -ivalue;
+      intv >>= 1;
+      if (ivalue < 0)
+      { intv |= XSUM_SIGN_MASK;
+      }
+      COPY64 (fltv, intv);
+      return fltv;
+    }
+    else
+    { /* Note: Left shift of -ve number is undefined, so do a multiply instead,
+               which is probably optimized to a shift. */
+      intv = ivalue * ((xsum_int)1 << (XSUM_LOW_MANTISSA_BITS-1))
+               + (sacc->chunk[0] >> 1);
+      if (intv < 0)
+      { if (intv > - ((xsum_int)1 << XSUM_MANTISSA_BITS))
+        { intv = (-intv) | XSUM_SIGN_MASK;
+          COPY64 (fltv, intv);
+          return fltv;
+        }
+      }
+      else /* non-negative */
+      { if ((xsum_uint)intv < (xsum_uint)1 << XSUM_MANTISSA_BITS)
+        {
+          COPY64 (fltv, intv);
+          return fltv;
+        }
+      }
+      /* otherwise, it's not actually denormalized, so fall through to below */
+    }
+  }
+
+  /* Find the location of the uppermost 1 bit in the absolute value of
+     the upper chunk by converting it (as a signed integer) to a
+     floating point value, and looking at the exponent.  Then set
+     'more' to the number of bits from the lower chunk (and maybe the
+     next lower) that are needed to fill out the mantissa of the
+     result (including the top implicit 1 bit), plus two extra bits to
+     help decide on rounding.  For negative numbers, it may turn out
+     later that we need another bit, because negating a negative value
+     may carry out of the top here, but not carry out of the top once
+     more bits are shifted into the bottom later on. */
+
+  fltv = (xsum_flt) ivalue;  /* finds position of topmost 1 bit of |ivalue| */
+  COPY64 (intv, fltv);
+  e = (intv >> XSUM_MANTISSA_BITS) & XSUM_EXP_MASK; /* e-bias is in 0..32 */
+  more = 2 + XSUM_MANTISSA_BITS + XSUM_EXP_BIAS - e;
+
+  /* Change 'ivalue' to put in 'more' bits from lower chunks into the bottom.
+     Also set 'j' to the index of the lowest chunk from which these bits came,
+     and 'lower' to the remaining bits of that chunk not now in 'ivalue'.
+     Note that 'lower' initially has at least one bit in it, which we can
+     later move into 'ivalue' if it turns out that one more bit is needed. */
+
+  ivalue *= (xsum_int)1 << more;  /* multiply, since << of negative undefined */
+
+  j = i-1;
+  lower = sacc->chunk[j];  /* must exist, since denormalized if i==0 */
+  if (more >= XSUM_LOW_MANTISSA_BITS)
+  { more -= XSUM_LOW_MANTISSA_BITS;
+    ivalue += lower << more;
+    j -= 1;
+    lower = j < 0 ? 0 : sacc->chunk[j];
+  }
+  ivalue += lower >> (XSUM_LOW_MANTISSA_BITS - more);
+  lower &= ((xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - more)) - 1;
+
+  /* Decide on rounding, with separate code for positive and negative values.
+
+     At this point, 'ivalue' has the signed mantissa bits, plus two extra
+     bits, with 'e' recording the exponent position for these within their
+     top chunk.  For positive 'ivalue', the bits in 'lower' and chunks
+     below 'j' add to the absolute value; for negative 'ivalue' they
+     subtract.
+
+     After setting 'ivalue' to the tentative unsigned mantissa
+     (shifted left 2), and 'intv' to have the correct sign, this
+     code goes to done_rounding if it finds that just discarding lower
+     order bits is correct, and to round_away_from_zero if instead the
+     magnitude should be increased by one in the lowest mantissa bit. */
+
+  if (ivalue >= 0)  /* number is positive, lower bits are added to magnitude */
+  {
+    intv = 0;  /* positive sign */
+
+    if ((ivalue & 2) == 0)  /* extra bits are 0x */
+    {
+      goto done_rounding;
+    }
+
+    if ((ivalue & 1) != 0)  /* extra bits are 11 */
+    {
+      goto round_away_from_zero;
+    }
+
+    if ((ivalue & 4) != 0)  /* low bit is 1 (odd), extra bits are 10 */
+    {
+      goto round_away_from_zero;
+    }
+
+    if (lower == 0)  /* see if any lower bits are non-zero */
+    { while (j > 0)
+      { j -= 1;
+        if (sacc->chunk[j] != 0)
+        { lower = 1;
+          break;
+        }
+      }
+    }
+
+    if (lower != 0)  /* low bit 0 (even), extra bits 10, non-zero lower bits */
+    {
+      goto round_away_from_zero;
+    }
+    else  /* low bit 0 (even), extra bits 10, all lower bits 0 */
+    {
+      goto done_rounding;
+    }
+  }
+
+  else  /* number is negative, lower bits are subtracted from magnitude */
+  {
+    /* Check for a negative 'ivalue' that when negated doesn't contain a full
+       mantissa's worth of bits, plus one to help rounding.  If so, move one
+       more bit into 'ivalue' from 'lower' (and remove it from 'lower').
+       This happens when the negation of the upper part of 'ivalue' has the
+       form 10000... but the negation of the full 'ivalue' is not 10000... */
+
+    if (((-ivalue) & ((xsum_int)1 << (XSUM_MANTISSA_BITS+2))) == 0)
+    { int pos = (xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - 1 - more);
+      ivalue *= 2;  /* note that left shift undefined if ivalue is negative */
+      if (lower & pos)
+      { ivalue += 1;
+        lower &= ~pos;
+      }
+      e -= 1;
+    }
+
+    intv = XSUM_SIGN_MASK;    /* negative sign */
+    ivalue = -ivalue;         /* ivalue now contains the absolute value */
+
+    if ((ivalue & 3) == 3)  /* extra bits are 11 */
+    {
+      goto round_away_from_zero;
+    }
+
+    if ((ivalue & 3) <= 1)  /* extra bits are 00 or 01 */
+    {
+      goto done_rounding;
+    }
+
+    if ((ivalue & 4) == 0)  /* low bit is 0 (even), extra bits are 10 */
+    {
+      goto done_rounding;
+    }
+
+    if (lower == 0)  /* see if any lower bits are non-zero */
+    { while (j > 0)
+      { j -= 1;
+        if (sacc->chunk[j] != 0)
+        { lower = 1;
+          break;
+        }
+      }
+    }
+
+    if (lower != 0)  /* low bit 1 (odd), extra bits 10, non-zero lower bits */
+    {
+      goto done_rounding;
+    }
+    else  /* low bit 1 (odd), extra bits are 10, lower bits are all 0 */
+    {
+      goto round_away_from_zero;
+    }
+
+  }
+
+round_away_from_zero:
+
+  /* Round away from zero, then check for carry having propagated out the
+     top, and shift if so. */
+
+  ivalue += 4;  /* add 1 to low-order mantissa bit */
+  if (ivalue & ((xsum_int)1 << (XSUM_MANTISSA_BITS+3)))
+  { ivalue >>= 1;
+    e += 1;
+  }
+
+done_rounding: ;
+
+  /* Get rid of the bottom 2 bits that were used to decide on rounding. */
+
+  ivalue >>= 2;
+
+  /* Adjust to the true exponent, accounting for where this chunk is. */
+
+  e += (i<<XSUM_LOW_EXP_BITS) - XSUM_EXP_BIAS - XSUM_MANTISSA_BITS;
+
+  /* If exponent has overflowed, change to plus or minus Inf and return. */
+
+  if (e >= XSUM_EXP_MASK)
+  { intv |= (xsum_int) XSUM_EXP_MASK << XSUM_MANTISSA_BITS;
+    COPY64 (fltv, intv);
+
+    return fltv;
+  }
+
+  /* Put exponent and mantissa into intv, which already has the sign,
+     then copy into fltv. */
+
+  intv += (xsum_int)e << XSUM_MANTISSA_BITS;
+  intv += ivalue & XSUM_MANTISSA_MASK;  /* mask out the implicit 1 bit */
+  COPY64 (fltv, intv);
+
+  if (xsum_debug)
+  {
+    if ((ivalue >> XSUM_MANTISSA_BITS) != 1) abort();
+  }
+
+  return fltv;
+}
+
+
+/* FIND RESULT OF DIVIDING SMALL ACCUMULATOR BY UNSIGNED INTEGER. */
+
+xsum_flt xsum_small_div_unsigned
+           (xsum_small_accumulator *restrict sacc, unsigned div)
+{
+  xsum_flt result;
+  unsigned rem;
+  double fltv;
+  int sign;
+  int i, j;
+
+  /* Return NaN or an Inf if that's what's in the superaccumulator. */
+
+  if (sacc->NaN != 0)
+  { COPY64(fltv, sacc->NaN);
+    return fltv;
+  }
+
+  if (sacc->Inf != 0)
+  { COPY64 (fltv, sacc->Inf);
+    return fltv;
+  }
+
+  /* Make a copy of the superaccumulator, so we can change it here without
+     changing *sacc. */
+
+  xsum_small_accumulator tacc = *sacc;
+
+  /* Carry propagate in the temporary copy of the superaccumulator.
+     Sets 'i' to the index of the topmost nonzero chunk. */
+
+  i = xsum_carry_propagate(&tacc);
+
+  /* Check for division by zero, and if so, return +Inf, -Inf, or NaN,
+     depending on whether the superaccumulator is positive, negative,
+     or zero. */
+
+  if (div == 0)
+  {
+    return tacc.chunk[i] > 0 ? INFINITY : tacc.chunk[i] < 0 ? -INFINITY : NAN;
+  }
+
+  /* Record sign of accumulator, and if it's negative, negate and
+     re-propagate so that it will be positive. */
+
+  sign = +1;
+
+  if (tacc.chunk[i] < 0)
+  { xsum_small_negate(&tacc);
+    i = xsum_carry_propagate(&tacc);
+    if (xsum_debug)
+    {
+      if (tacc.chunk[i] < 0) abort();
+    }
+    sign = -1;
+  }
+
+  /* Do the division in the small accumulator, putting the remainder after
+     dividing the bottom chunk in 'rem'. */
+
+  rem = 0;
+  for (j = i; j>=0; j--)
+  { xsum_uint num = ((xsum_uint) rem << XSUM_LOW_MANTISSA_BITS) + tacc.chunk[j];
+    xsum_uint quo = num / div;
+    rem = num - quo*div;
+    tacc.chunk[j] = quo;
+  }
+
+  /* Find new top chunk. */
+
+  while (i > 0 && tacc.chunk[i] == 0)
+  { i -= 1;
+  }
+
+  /* Do rounding, with separate approaches for a normal number with biased
+     exponent greater than 1, and for a normal number with exponent of 1
+     or a denormalized number (also having true biased exponent of 1). */
+
+  if (i > 1 || tacc.chunk[1] >= (1 << (XSUM_HIGH_MANTISSA_BITS+2)))
+  {
+    /* Normalized number with at least two bits at bottom of chunk 0
+       below the mantissa.  Just need to 'or' in a 1 at the bottom if
+       remainder is non-zero to break a tie if bits below bottom of
+       mantissa are exactly 1/2. */
+
+    if (rem > 0)
+    { tacc.chunk[0] |= 1;
+    }
+  }
+  else
+  {
+    /* Denormalized number or normal number with biased exponent of 1.
+       Lowest bit of bottom chunk is just below lowest bit of
+       mantissa.  Need to explicitly round here using the bottom bit
+       and the remainder - round up if lower > 1/2 or >= 1/2 and
+       odd. */
+
+    if (tacc.chunk[0] & 1)  /* lower part is >= 1/2 */
+    {
+      if (tacc.chunk[0] & 2)  /* lowest bit of mantissa is 1 (odd) */
+      { tacc.chunk[0] += 2;     /* round up */
+      }
+      else                    /* lowest bit of mantissa is 0 (even) */
+      { if (rem > 0)            /* lower part is > 1/2 */
+        { tacc.chunk[0] += 2;     /* round up */
+        }
+      }
+
+      tacc.chunk[0] &= ~1;  /* clear low bit (but should anyway be ignored) */
+    }
+  }
+
+  /* Do the final rounding, with the lowest bit set as above. */
+
+  result = xsum_small_round (&tacc);
+
+  return sign*result;
+}
+
+
+/* FIND RESULT OF DIVIDING SMALL ACCUMULATOR BY SIGNED INTEGER. */
+
+xsum_flt xsum_small_div_int
+           (xsum_small_accumulator *restrict sacc, int div)
+{
+  if (div < 0)
+  { return -xsum_small_div_unsigned (sacc, (unsigned) -div);
+  }
+  else
+  { return xsum_small_div_unsigned (sacc, (unsigned) div);
+  }
+}
diff --git a/lib/monoucha0/monoucha/qjs/xsum.h b/lib/monoucha0/monoucha/qjs/xsum.h
new file mode 100644
index 00000000..2372cac6
--- /dev/null
+++ b/lib/monoucha0/monoucha/qjs/xsum.h
@@ -0,0 +1,133 @@
+/* INTERFACE TO FUNCTIONS FOR EXACT SUMMATION. */
+
+/* Copyright 2015, 2018, 2021 Radford M. Neal
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef XSUM_H
+#define XSUM_H
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+
+
+/* CONSTANTS DEFINING THE FLOATING POINT FORMAT. */
+
+typedef double xsum_flt;           /* C floating point type sums are done for */
+
+typedef int64_t xsum_int;          /* Signed integer type for a fp value */
+typedef uint64_t xsum_uint;        /* Unsigned integer type for a fp value */
+typedef int_fast16_t xsum_expint;  /* Integer type for holding an exponent */
+
+#define XSUM_MANTISSA_BITS 52      /* Bits in fp mantissa, excludes implicit 1 */
+#define XSUM_EXP_BITS 11           /* Bits in fp exponent */
+
+#define XSUM_MANTISSA_MASK \
+  (((xsum_int)1 << XSUM_MANTISSA_BITS) - 1)  /* Mask for mantissa bits */
+
+#define XSUM_EXP_MASK \
+  ((1 << XSUM_EXP_BITS) - 1)                 /* Mask for exponent */
+
+#define XSUM_EXP_BIAS \
+  ((1 << (XSUM_EXP_BITS-1)) - 1)             /* Bias added to signed exponent */
+
+#define XSUM_SIGN_BIT \
+  (XSUM_MANTISSA_BITS + XSUM_EXP_BITS)       /* Position of sign bit */
+
+#define XSUM_SIGN_MASK \
+  ((xsum_uint)1 << XSUM_SIGN_BIT)            /* Mask for sign bit */
+
+
+/* CONSTANTS DEFINING THE SMALL ACCUMULATOR FORMAT. */
+
+#define XSUM_SCHUNK_BITS 64        /* Bits in chunk of the small accumulator */
+typedef int64_t xsum_schunk;       /* Integer type of small accumulator chunk */
+
+#define XSUM_LOW_EXP_BITS 5        /* # of low bits of exponent, in one chunk */
+
+#define XSUM_LOW_EXP_MASK \
+  ((1 << XSUM_LOW_EXP_BITS) - 1)       /* Mask for low-order exponent bits */
+
+#define XSUM_HIGH_EXP_BITS \
+  (XSUM_EXP_BITS - XSUM_LOW_EXP_BITS)  /* # of high exponent bits for index */
+
+#define XSUM_HIGH_EXP_MASK \
+  ((1 << XSUM_HIGH_EXP_BITS) - 1)      /* Mask for high-order exponent bits */
+
+#define XSUM_SCHUNKS \
+  ((1 << XSUM_HIGH_EXP_BITS) + 3)      /* # of chunks in small accumulator */
+
+#define XSUM_LOW_MANTISSA_BITS \
+  (1 << XSUM_LOW_EXP_BITS)             /* Bits in low part of mantissa */
+
+#define XSUM_HIGH_MANTISSA_BITS \
+  (XSUM_MANTISSA_BITS - XSUM_LOW_MANTISSA_BITS)  /* Bits in high part */
+
+#define XSUM_LOW_MANTISSA_MASK \
+  (((xsum_int)1 << XSUM_LOW_MANTISSA_BITS) - 1)  /* Mask for low bits */
+
+#define XSUM_SMALL_CARRY_BITS \
+ ((XSUM_SCHUNK_BITS-1) - XSUM_MANTISSA_BITS)     /* Bits sums can carry into */
+
+#define XSUM_SMALL_CARRY_TERMS \
+  ((1 << XSUM_SMALL_CARRY_BITS) - 1)   /* # terms can add before need prop. */
+
+typedef struct
+{ xsum_schunk chunk[XSUM_SCHUNKS]; /* Chunks making up small accumulator */
+  xsum_int Inf;                    /* If non-zero, +Inf, -Inf, or NaN */
+  xsum_int NaN;                    /* If non-zero, a NaN value with payload */
+  int adds_until_propagate;        /* Number of remaining adds before carry */
+} xsum_small_accumulator;          /*     propagation must be done again    */
+
+
+/* TYPE FOR LENGTHS OF ARRAYS.  Must be a signed integer type.  Set to
+   ptrdiff_t here on the assumption that this will be big enough, but
+   not unnecessarily big, which seems to be true. */
+
+typedef ptrdiff_t xsum_length;
+
+
+/* FUNCTIONS FOR EXACT SUMMATION, WITH POSSIBLE DIVISION BY AN INTEGER. */
+
+void xsum_small_init (xsum_small_accumulator *restrict);
+void xsum_small_add1 (xsum_small_accumulator *restrict, xsum_flt);
+void xsum_small_addv (xsum_small_accumulator *restrict,
+                      const xsum_flt *restrict, xsum_length);
+void xsum_small_add_sqnorm (xsum_small_accumulator *restrict,
+                            const xsum_flt *restrict, xsum_length);
+void xsum_small_add_dot (xsum_small_accumulator *restrict,
+                         const xsum_flt *, const xsum_flt *, xsum_length);
+void xsum_small_add_accumulator (xsum_small_accumulator *,
+                                 xsum_small_accumulator *);
+void xsum_small_negate (xsum_small_accumulator *restrict);
+xsum_flt xsum_small_round (xsum_small_accumulator *restrict);
+
+xsum_flt xsum_small_div_unsigned (xsum_small_accumulator *restrict, unsigned);
+xsum_flt xsum_small_div_int (xsum_small_accumulator *restrict, int);
+
+
+/* DEBUG FLAG.  Set to non-zero for debug output.  Ignored unless xsum.c
+   is compiled with -DDEBUG. */
+
+extern int xsum_debug;
+
+#endif
diff --git a/lib/monoucha0/monoucha/quickjs.nim b/lib/monoucha0/monoucha/quickjs.nim
index 2154b42a..4ffefae7 100644
--- a/lib/monoucha0/monoucha/quickjs.nim
+++ b/lib/monoucha0/monoucha/quickjs.nim
@@ -20,7 +20,7 @@ else:
   {.passl: "-lpthread".}
 
 {.compile("qjs/quickjs.c", CFLAGS).}
-{.compile("qjs/libbf.c", CFLAGS).}
+{.compile("qjs/xsum.c", CFLAGS).}
 
 {.passc: "-I" & currentSourcePath().parentDir().}
 
diff --git a/res/license.md b/res/license.md
index 23e8fbc4..5efcb054 100644
--- a/res/license.md
+++ b/res/license.md
@@ -26,6 +26,7 @@ Table of contents:
 * [JebP](#jebp)
 * [NanoSVG](#nanosvg)
 * [QuickJS-NG](#quickjs-ng)
+	* [xsum](#xsum)
 * [Punycode library](#punycode-library)
 * [GNU Unifont](#gnu-unifont)
 
@@ -161,6 +162,33 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 ```
 
+### xsum
+
+QuickJS-NG also includes xsum.c by Radford M. Neal for Math.sumPrecise.
+
+```
+Copyright 2015, 2018, 2021, 2024 Radford M. Neal
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+```
+
 ## Punycode library
 
 We vendor the punycode library, which is no longer included in the Nim