summary refs log tree commit diff stats
path: root/tinyc/lib
diff options
context:
space:
mode:
authorDmitry Atamanov <data-man@users.noreply.github.com>2017-10-28 10:25:56 +0300
committerAndreas Rumpf <rumpf_a@web.de>2017-10-28 09:25:56 +0200
commitd2c7d391c8b69a6a590a2f702ed58bea033f6325 (patch)
treec74a1b46e1166ddb87453ddc49cea84e1baaa5ab /tinyc/lib
parent9c00f6decd4453a4233450a60ccef05b20e9f24a (diff)
downloadNim-d2c7d391c8b69a6a590a2f702ed58bea033f6325.tar.gz
TinyC upgrade (#6593)
Diffstat (limited to 'tinyc/lib')
-rw-r--r--tinyc/lib/alloca-arm.S17
-rw-r--r--tinyc/lib/alloca86-bt.S14
-rw-r--r--tinyc/lib/alloca86.S8
-rw-r--r--tinyc/lib/alloca86_64-bt.S56
-rw-r--r--tinyc/lib/alloca86_64.S34
-rw-r--r--tinyc/lib/armeabi.c501
-rw-r--r--tinyc/lib/armflush.c58
-rw-r--r--tinyc/lib/bcheck.c359
-rw-r--r--tinyc/lib/lib-arm64.c664
-rw-r--r--tinyc/lib/libtcc1.c37
-rw-r--r--tinyc/lib/va_list.c65
11 files changed, 1667 insertions, 146 deletions
diff --git a/tinyc/lib/alloca-arm.S b/tinyc/lib/alloca-arm.S
new file mode 100644
index 000000000..68556e36d
--- /dev/null
+++ b/tinyc/lib/alloca-arm.S
@@ -0,0 +1,17 @@
+	.text
+	.align	2
+	.global	alloca
+	.type	alloca, %function
+alloca:			/* in: r0 = size; out: r0 = new sp (start of the block) */
+#ifdef __TINYC__	/* same four instructions as the #else branch, pre-encoded as words */
+        .int 0xe060d00d	/* rsb sp, r0, sp */
+        .int 0xe3cdd007	/* bic sp, sp, #7 */
+        .int 0xe1a0000d	/* mov r0, sp */
+        .int 0xe1a0f00e	/* mov pc, lr */
+#else
+	rsb	sp, r0, sp	/* sp = sp - r0 */
+	bic	sp, sp, #7	/* clear the low 3 bits: round sp down to a multiple of 8 */
+	mov	r0, sp		/* result = new stack pointer */
+	mov	pc, lr		/* return to the caller */
+#endif
+	.size	alloca, .-alloca
diff --git a/tinyc/lib/alloca86-bt.S b/tinyc/lib/alloca86-bt.S
index 994da2042..4f95cf134 100644
--- a/tinyc/lib/alloca86-bt.S
+++ b/tinyc/lib/alloca86-bt.S
@@ -1,7 +1,5 @@
 /* ---------------------------------------------- */
-/* alloca86b.S */
-
-#include "../config.h"
+/* alloca86-bt.S */
 
 .globl __bound_alloca
 
@@ -13,13 +11,13 @@ __bound_alloca:
     and     $-4,%eax
     jz      p6
 
-#ifdef TCC_TARGET_PE
+#ifdef _WIN32
 p4:
     cmp     $4096,%eax
-    jle     p5
+    jbe     p5
+    test    %eax,-4096(%esp)
     sub     $4096,%esp
     sub     $4096,%eax
-    test    %eax,(%esp)
     jmp p4
 
 p5:
@@ -42,4 +40,8 @@ p6:
     push    %edx
     ret
 
+/* mark stack as nonexecutable */
+#if defined __ELF__ && defined __linux__
+    .section    .note.GNU-stack,"",@progbits
+#endif
 /* ---------------------------------------------- */
diff --git a/tinyc/lib/alloca86.S b/tinyc/lib/alloca86.S
index fb208a0ba..bb7a2c24a 100644
--- a/tinyc/lib/alloca86.S
+++ b/tinyc/lib/alloca86.S
@@ -1,8 +1,6 @@
 /* ---------------------------------------------- */
 /* alloca86.S */
 
-#include "../config.h"
-
 .globl alloca
 
 alloca:
@@ -12,13 +10,13 @@ alloca:
     and     $-4,%eax
     jz      p3
 
-#ifdef TCC_TARGET_PE
+#ifdef _WIN32
 p1:
     cmp     $4096,%eax
-    jle     p2
+    jbe     p2
+    test    %eax,-4096(%esp)
     sub     $4096,%esp
     sub     $4096,%eax
-    test    %eax,(%esp)
     jmp p1
 p2:
 #endif
diff --git a/tinyc/lib/alloca86_64-bt.S b/tinyc/lib/alloca86_64-bt.S
new file mode 100644
index 000000000..4cbad90f8
--- /dev/null
+++ b/tinyc/lib/alloca86_64-bt.S
@@ -0,0 +1,56 @@
+/* ---------------------------------------------- */
+/* alloca86_64-bt.S */
+
+.globl __bound_alloca
+__bound_alloca:
+
+#ifdef _WIN32
+    # bound checking is not implemented
+    pop     %rdx                /* rdx = return address */
+    mov     %rcx,%rax           /* rax = requested size (first argument) */
+    add     $15,%rax
+    and     $-16,%rax           /* round the size up to a multiple of 16 */
+    jz      p3                  /* rounded size is 0: return 0 */
+
+p1:
+    cmp     $4096,%rax
+    jbe     p2                  /* 4096 bytes or fewer left: stop stepping */
+    test    %rax,-4096(%rsp)    /* touch memory 4096 bytes below rsp before moving rsp there */
+    sub     $4096,%rsp
+    sub     $4096,%rax
+    jmp p1
+p2:
+
+    sub     %rax,%rsp           /* allocate the remaining bytes */
+    mov     %rsp,%rax
+    add     $32,%rax            /* result = rsp + 32; presumably skips the Win64 shadow space - TODO confirm */
+
+p3:
+    push    %rdx                /* put the return address back for ret */
+    ret
+#else
+    pop     %rdx                /* rdx = return address */
+    mov     %rdi,%rax           /* rax = requested size (first argument) */
+    mov     %rax,%rsi	# size, a second parm to the __bound_new_region
+
+    add     $15,%rax
+    and     $-16,%rax           /* round the size up to a multiple of 16 */
+    jz      p3                  /* rounded size is 0: return 0 */
+
+
+    sub     %rax,%rsp
+    mov     %rsp,%rdi	# pointer, a first parm to the __bound_new_region
+    mov     %rsp,%rax
+
+    push    %rdx                /* save return address and result across the call */
+    push    %rax
+    call   __bound_new_region
+    pop     %rax
+    pop     %rdx
+
+p3:
+    push    %rdx                /* put the return address back for ret */
+    ret
+#endif
+
+/* ---------------------------------------------- */
diff --git a/tinyc/lib/alloca86_64.S b/tinyc/lib/alloca86_64.S
new file mode 100644
index 000000000..ae3c97de3
--- /dev/null
+++ b/tinyc/lib/alloca86_64.S
@@ -0,0 +1,34 @@
+/* ---------------------------------------------- */
+/* alloca86_64.S */
+
+.globl alloca
+
+alloca:
+    pop     %rdx                /* rdx = return address */
+#ifdef _WIN32
+    mov     %rcx,%rax           /* size argument arrives in rcx */
+#else
+    mov     %rdi,%rax           /* size argument arrives in rdi */
+#endif
+    add     $15,%rax
+    and     $-16,%rax           /* round the size up to a multiple of 16 */
+    jz      p3                  /* rounded size is 0: return 0 */
+
+#ifdef _WIN32
+p1:
+    cmp     $4096,%rax
+    jbe     p2                  /* 4096 bytes or fewer left: stop stepping */
+    test    %rax,-4096(%rsp)    /* touch memory 4096 bytes below rsp before moving rsp there */
+    sub     $4096,%rsp
+    sub     $4096,%rax
+    jmp p1
+p2:
+#endif
+
+    sub     %rax,%rsp           /* allocate the remaining bytes */
+    mov     %rsp,%rax           /* result = new stack pointer */
+p3:
+    push    %rdx                /* put the return address back for ret */
+    ret
+
+/* ---------------------------------------------- */
diff --git a/tinyc/lib/armeabi.c b/tinyc/lib/armeabi.c
new file mode 100644
index 000000000..a59640dd0
--- /dev/null
+++ b/tinyc/lib/armeabi.c
@@ -0,0 +1,501 @@
+/* TCC ARM runtime EABI
+   Copyright (C) 2013 Thomas Preud'homme
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.*/
+
+#ifdef __TINYC__
+#define INT_MIN (-2147483647 - 1)
+#define INT_MAX 2147483647
+#define UINT_MAX 0xffffffff
+#define LONG_MIN (-2147483647L - 1)
+#define LONG_MAX 2147483647L
+#define ULONG_MAX 0xffffffffUL
+#define LLONG_MAX 9223372036854775807LL
+#define LLONG_MIN (-9223372036854775807LL - 1)
+#define ULLONG_MAX 0xffffffffffffffffULL
+#else
+#include <limits.h>
+#endif
+
+/* We rely on the little endianness and EABI calling convention for this to
+   work */
+
+typedef struct double_unsigned_struct {
+    unsigned low;
+    unsigned high;
+} double_unsigned_struct;
+
+typedef struct unsigned_int_struct {
+    unsigned low;
+    int high;
+} unsigned_int_struct;
+
+#define REGS_RETURN(name, type) \
+    void name ## _return(type ret) {}
+
+
+/* Float helper functions */
+
+#define FLOAT_EXP_BITS 8
+#define FLOAT_FRAC_BITS 23
+
+#define DOUBLE_EXP_BITS 11
+#define DOUBLE_FRAC_BITS 52
+
+#define ONE_EXP(type) ((1 << (type ## _EXP_BITS - 1)) - 1)
+
+REGS_RETURN(unsigned_int_struct, unsigned_int_struct)
+REGS_RETURN(double_unsigned_struct, double_unsigned_struct)
+
+/* float -> integer: (sign) 1.fraction x 2^(exponent - exp_for_one) */
+
+
+/* float to [unsigned] long long conversion */
+#define DEFINE__AEABI_F2XLZ(name, with_sign)                                 \
+void __aeabi_ ## name(unsigned val)                                          \
+{                                                                            \
+    int exp, high_shift, sign;                                               \
+    double_unsigned_struct ret;                                              \
+                                                                             \
+    /* compute sign */                                                       \
+    sign = val >> 31;                                                        \
+                                                                             \
+    /* compute real exponent */                                              \
+    exp = val >> FLOAT_FRAC_BITS;                                            \
+    exp &= (1 << FLOAT_EXP_BITS) - 1;                                        \
+    exp -= ONE_EXP(FLOAT);                                                   \
+                                                                             \
+    /* undefined behavior if truncated value cannot be represented */        \
+    if (with_sign) {                                                         \
+        if (exp > 62) /* |val| too big, double cannot represent LLONG_MAX */ \
+            return;                                                          \
+    } else {                                                                 \
+        if ((sign && exp >= 0) || exp > 63) /* if val < 0 || val too big */  \
+            return;                                                          \
+    }                                                                        \
+                                                                             \
+    val &= (1 << FLOAT_FRAC_BITS) - 1;                                       \
+    if (exp >= 32) {                                                         \
+        ret.high = 1 << (exp - 32);                                          \
+        if (exp - 32 >= FLOAT_FRAC_BITS) {                                   \
+            ret.high |= val << (exp - 32 - FLOAT_FRAC_BITS);                 \
+            ret.low = 0;                                                     \
+        } else {                                                             \
+            high_shift = FLOAT_FRAC_BITS - (exp - 32);                       \
+            ret.high |= val >> high_shift;                                   \
+            ret.low = val << (32 - high_shift);                              \
+        }                                                                    \
+    } else {                                                                 \
+        ret.high = 0;                                                        \
+        ret.low = 1 << exp;                                                  \
+        if (exp > FLOAT_FRAC_BITS)                                           \
+            ret.low |= val << (exp - FLOAT_FRAC_BITS);                       \
+        else                                                                 \
+            ret.low |= val >> (FLOAT_FRAC_BITS - exp);                       \
+    }                                                                        \
+                                                                             \
+    /* encode negative integer using 2's complement */                       \
+    if (with_sign && sign) {                                                 \
+        ret.low = ~ret.low;                                                  \
+        ret.high = ~ret.high;                                                \
+        if (ret.low == UINT_MAX) {                                           \
+            ret.low = 0;                                                     \
+            ret.high++;                                                      \
+        } else                                                               \
+            ret.low++;                                                       \
+    }                                                                        \
+                                                                             \
+    double_unsigned_struct_return(ret);                                      \
+}
+
+/* float to unsigned long long conversion */
+DEFINE__AEABI_F2XLZ(f2ulz, 0)
+
+/* float to long long conversion */
+DEFINE__AEABI_F2XLZ(f2lz, 1)
+
+/* double to [unsigned] long long conversion; val holds the raw IEEE-754 bits */
+#define DEFINE__AEABI_D2XLZ(name, with_sign)                                 \
+void __aeabi_ ## name(double_unsigned_struct val)                            \
+{                                                                            \
+    int exp, high_shift, sign;                                               \
+    double_unsigned_struct ret;                                              \
+                                                                             \
+    /* compute sign */                                                       \
+    sign = val.high >> 31;                                                   \
+                                                                             \
+    /* compute real exponent */                                              \
+    exp = (val.high >> (DOUBLE_FRAC_BITS - 32));                             \
+    exp &= (1 << DOUBLE_EXP_BITS) - 1;                                       \
+    exp -= ONE_EXP(DOUBLE);                                                  \
+                                                                             \
+    /* undefined behavior if truncated value cannot be represented */        \
+    if (with_sign) {                                                         \
+        if (exp > 62) /* |val| too big, double cannot represent LLONG_MAX */ \
+            return;                                                          \
+    } else {                                                                 \
+        if ((sign && exp >= 0) || exp > 63) /* if val < 0 || val too big */  \
+            return;                                                          \
+    }                                                                        \
+                                                                             \
+    val.high &= (1 << (DOUBLE_FRAC_BITS - 32)) - 1;                          \
+    if (exp >= 32) {                                                         \
+        ret.high = 1 << (exp - 32);                                          \
+        if (exp >= DOUBLE_FRAC_BITS) {                                       \
+            high_shift = exp - DOUBLE_FRAC_BITS;                             \
+            ret.high |= val.high << high_shift;                              \
+            ret.high |= (val.low >> (31 - high_shift)) >> 1;                 \
+            ret.low = val.low << high_shift;                                 \
+        } else {                                                             \
+            high_shift = DOUBLE_FRAC_BITS - exp;                             \
+            ret.high |= val.high >> high_shift;                              \
+            ret.low = val.high << (32 - high_shift);                         \
+            ret.low |= val.low >> high_shift;                                \
+        }                                                                    \
+    } else {                                                                 \
+        ret.high = 0;                                                        \
+        ret.low = 1 << exp;                                                  \
+        if (exp > DOUBLE_FRAC_BITS - 32) { /* fraction must shift left */    \
+            high_shift = exp - (DOUBLE_FRAC_BITS - 32);                      \
+            ret.low |= val.high << high_shift;                               \
+            ret.low |= val.low >> (32 - high_shift);                         \
+        } else                                                               \
+            ret.low |= val.high >> (DOUBLE_FRAC_BITS - 32 - exp);            \
+    }                                                                        \
+                                                                             \
+    /* encode negative integer using 2's complement */                       \
+    if (with_sign && sign) {                                                 \
+        ret.low = ~ret.low;                                                  \
+        ret.high = ~ret.high;                                                \
+        if (ret.low == UINT_MAX) {                                           \
+            ret.low = 0;                                                     \
+            ret.high++;                                                      \
+        } else                                                               \
+            ret.low++;                                                       \
+    }                                                                        \
+                                                                             \
+    double_unsigned_struct_return(ret);                                      \
+}
+
+/* double to unsigned long long conversion */
+DEFINE__AEABI_D2XLZ(d2ulz, 0)
+
+/* double to long long conversion */
+DEFINE__AEABI_D2XLZ(d2lz, 1)
+
+/* [unsigned] long long to float conversion; the result is the raw IEEE-754 single bit pattern */
+#define DEFINE__AEABI_XL2F(name, with_sign)                             \
+unsigned __aeabi_ ## name(unsigned long long v)                         \
+{                                                                       \
+    int s /* shift */, flb /* first lost bit */, sign = 0;              \
+    unsigned p = 0 /* power */, ret;                                    \
+    double_unsigned_struct val;                                         \
+                                                                        \
+    /* IEEE 754 is sign-magnitude: keep the sign, convert |v| */        \
+    if (with_sign && (v & (1ULL << 63))) {                              \
+        sign = 1;                                                       \
+        v = ~v + 1;                                                     \
+    }                                                                   \
+    val.low = v;                                                        \
+    val.high = v >> 32;                                                 \
+    /* fill fraction bits */                                            \
+    for (s = 31, p = 1 << 31; p && !(val.high & p); s--, p >>= 1);      \
+    if (p) {                                                            \
+        ret = val.high & (p - 1);                                       \
+        if (s <= FLOAT_FRAC_BITS) {                                     \
+            ret <<= FLOAT_FRAC_BITS - s;                                \
+            ret |= (val.low >> (31 - (FLOAT_FRAC_BITS - s))) >> 1;      \
+            flb = (val.low >> (31 - (FLOAT_FRAC_BITS - s))) & 1;        \
+        } else {                                                        \
+            flb = (ret >> (s - FLOAT_FRAC_BITS - 1)) & 1;               \
+            ret >>= s - FLOAT_FRAC_BITS;                                \
+        }                                                               \
+        s += 32;                                                        \
+    } else {                                                            \
+        for (s = 31, p = 1 << 31; p && !(val.low & p); s--, p >>= 1);   \
+        if (p) {                                                        \
+            ret = val.low & (p - 1);                                    \
+            if (s <= FLOAT_FRAC_BITS) {                                 \
+                ret <<= FLOAT_FRAC_BITS - s;                            \
+                flb = 0;                                                \
+            } else {                                                    \
+                flb = (ret >> (s - FLOAT_FRAC_BITS - 1)) & 1;           \
+                ret >>= s - FLOAT_FRAC_BITS;                            \
+            }                                                           \
+        } else                                                          \
+            return 0;                                                   \
+    }                                                                   \
+    if (flb) /* round half up */                                        \
+        ret++;                                                          \
+                                                                        \
+    /* add (not OR) exponent: a rounding carry must bump it */          \
+    ret += (s + ONE_EXP(FLOAT)) << FLOAT_FRAC_BITS;                     \
+                                                                        \
+    /* fill sign bit */                                                 \
+    ret |= sign << 31;                                                  \
+                                                                        \
+    return ret;                                                         \
+}
+
+/* unsigned long long to float conversion */
+DEFINE__AEABI_XL2F(ul2f, 0)
+
+/* long long to float conversion */
+DEFINE__AEABI_XL2F(l2f, 1)
+
+/* [unsigned] long long to double conversion; the result is the raw IEEE-754 double bit pattern */
+#define __AEABI_XL2D(name, with_sign)                                   \
+void __aeabi_ ## name(unsigned long long v)                             \
+{                                                                       \
+    int s /* shift */, high_shift, sign = 0;                            \
+    unsigned tmp, p = 0;                                                \
+    double_unsigned_struct val, ret = { 0, 0 };                         \
+                                                                        \
+    /* IEEE 754 is sign-magnitude: keep the sign, convert |v| */        \
+    if (with_sign && (v & (1ULL << 63))) {                              \
+        sign = 1;                                                       \
+        v = ~v + 1;                                                     \
+    }                                                                   \
+    val.low = v;                                                        \
+    val.high = v >> 32;                                                 \
+                                                                        \
+    /* fill fraction bits */                                            \
+    for (s = 31, p = 1 << 31; p && !(val.high & p); s--, p >>= 1);      \
+    if (p) {                                                            \
+        tmp = val.high & (p - 1);                                       \
+        if (s <= DOUBLE_FRAC_BITS - 32) {                               \
+            high_shift = DOUBLE_FRAC_BITS - 32 - s;                     \
+            ret.high = tmp << high_shift;                               \
+            ret.high |= (val.low >> (31 - high_shift)) >> 1;            \
+            ret.low = val.low << high_shift;                            \
+        } else {                                                        \
+            high_shift = s - (DOUBLE_FRAC_BITS - 32);                   \
+            ret.high = tmp >> high_shift;                               \
+            ret.low = tmp << (32 - high_shift);                         \
+            ret.low |= val.low >> high_shift;                           \
+            if ((val.low >> (high_shift - 1)) & 1) {                    \
+                if (ret.low == UINT_MAX) {                              \
+                    ret.high++;                                         \
+                    ret.low = 0;                                        \
+                } else                                                  \
+                    ret.low++;                                          \
+            }                                                           \
+        }                                                               \
+        s += 32;                                                        \
+    } else {                                                            \
+        for (s = 31, p = 1 << 31; p && !(val.low & p); s--, p >>= 1);   \
+        if (p) {                                                        \
+            tmp = val.low & (p - 1);                                    \
+            if (s <= DOUBLE_FRAC_BITS - 32) {                           \
+                high_shift = DOUBLE_FRAC_BITS - 32 - s;                 \
+                ret.high = tmp << high_shift;                           \
+                ret.low = 0;                                            \
+            } else {                                                    \
+                high_shift = s - (DOUBLE_FRAC_BITS - 32);               \
+                ret.high = tmp >> high_shift;                           \
+                ret.low = tmp << (32 - high_shift);                     \
+            }                                                           \
+        } else { /* v == 0: ret is still all-zero (+0.0) */             \
+            double_unsigned_struct_return(ret);                         \
+            return;                                                     \
+        }                                                               \
+    }                                                                   \
+                                                                        \
+    /* add (not OR) exponent: a rounding carry must bump it */          \
+    ret.high += (s + ONE_EXP(DOUBLE)) << (DOUBLE_FRAC_BITS - 32);       \
+                                                                        \
+    /* fill sign bit */                                                 \
+    ret.high |= sign << 31;                                             \
+                                                                        \
+    double_unsigned_struct_return(ret);                                 \
+}
+
+/* unsigned long long to double conversion */
+__AEABI_XL2D(ul2d, 0)
+
+/* long long to double conversion */
+__AEABI_XL2D(l2d, 1)
+
+
+/* Long long helper functions */
+
+/* TODO: add error in case of den == 0 (see §4.3.1 and §4.3.2) */
+
+#define define_aeabi_xdivmod_signed_type(basetype, type) \
+typedef struct type {                                    \
+    basetype quot;                                       \
+    unsigned basetype rem;                               \
+} type
+
+#define define_aeabi_xdivmod_unsigned_type(basetype, type) \
+typedef struct type {                                      \
+    basetype quot;                                         \
+    basetype rem;                                          \
+} type
+
+#define AEABI_UXDIVMOD(name,type, rettype, typemacro)                     \
+static inline rettype aeabi_ ## name (type num, type den)                 \
+{                                                                         \
+    rettype ret;                                                          \
+    type quot = 0;                                                        \
+                                                                          \
+    /* Peel off den * 2^k chunks; never terminates if den == 0 */         \
+    while (num >= den) {                                                  \
+        type q = 1;                                                       \
+                                                                          \
+        /* Largest q = 2^k with q * den <= num, overflow-guarded */       \
+        while ((q << 1) * den <= num && q * den <= typemacro ## _MAX / 2) \
+            q <<= 1;                                                      \
+                                                                          \
+        /* Subtract that multiple of den; add q to the quotient */        \
+        num -= q * den;                                                   \
+        quot += q;                                                        \
+    }                                                                     \
+    ret.quot = quot;                                                      \
+    ret.rem = num;                                                        \
+    return ret;                                                           \
+}
+
+#define __AEABI_XDIVMOD(name, type, uiname, rettype, urettype, typemacro)     \
+void __aeabi_ ## name(type numerator, type denominator)                       \
+{                                                                             \
+    unsigned type num, den;                                                   \
+    urettype uxdiv_ret;                                                       \
+    rettype ret;                                                              \
+                                                                              \
+    if (numerator >= 0)                                                       \
+      num = numerator;                                                        \
+    else                                                                      \
+      num = 0 - numerator;                                                    \
+    if (denominator >= 0)                                                     \
+      den = denominator;                                                      \
+    else                                                                      \
+      den = 0 - denominator;                                                  \
+    uxdiv_ret = aeabi_ ## uiname(num, den);                                   \
+    /* signs differ */                                                        \
+    if ((numerator & typemacro ## _MIN) != (denominator & typemacro ## _MIN)) \
+        ret.quot = 0 - uxdiv_ret.quot;                                        \
+    else                                                                      \
+        ret.quot = uxdiv_ret.quot;                                            \
+    if (numerator < 0)                                                        \
+        ret.rem = 0 - uxdiv_ret.rem;                                          \
+    else                                                                      \
+        ret.rem = uxdiv_ret.rem;                                              \
+                                                                              \
+    rettype ## _return(ret);                                                  \
+}
+
+define_aeabi_xdivmod_signed_type(long long, lldiv_t);
+define_aeabi_xdivmod_unsigned_type(unsigned long long, ulldiv_t);
+define_aeabi_xdivmod_signed_type(int, idiv_t);
+define_aeabi_xdivmod_unsigned_type(unsigned, uidiv_t);
+
+REGS_RETURN(lldiv_t, lldiv_t)
+REGS_RETURN(ulldiv_t, ulldiv_t)
+REGS_RETURN(idiv_t, idiv_t)
+REGS_RETURN(uidiv_t, uidiv_t)
+
+AEABI_UXDIVMOD(uldivmod, unsigned long long, ulldiv_t, ULLONG)
+
+__AEABI_XDIVMOD(ldivmod, long long, uldivmod, lldiv_t, ulldiv_t, LLONG)
+
+void __aeabi_uldivmod(unsigned long long num, unsigned long long den)
+{
+    ulldiv_t_return(aeabi_uldivmod(num, den));
+}
+
+void __aeabi_llsl(double_unsigned_struct val, int shift)
+{
+    double_unsigned_struct ret;
+
+    if (shift >= 32) {
+        val.high = val.low;
+        val.low = 0;
+        shift -= 32;
+    }
+    if (shift > 0) {
+        ret.low = val.low << shift;
+        ret.high = (val.high << shift) | (val.low >> (32 - shift));
+        double_unsigned_struct_return(ret);
+	return;
+    }
+    double_unsigned_struct_return(val);
+}
+
+#define aeabi_lsr(val, shift, fill, type)                          \
+    type ## _struct ret;                                           \
+                                                                   \
+    if (shift >= 32) {                                             \
+        val.low = val.high;                                        \
+        val.high = fill;                                           \
+        shift -= 32;                                               \
+    }                                                              \
+    if (shift > 0) {                                               \
+        ret.high = val.high >> shift;                              \
+        ret.low = (val.high << (32 - shift)) | (val.low >> shift); \
+        type ## _struct_return(ret);                               \
+	return;                                                    \
+    }                                                              \
+    type ## _struct_return(val);
+
+void __aeabi_llsr(double_unsigned_struct val, int shift)
+{
+    aeabi_lsr(val, shift, 0, double_unsigned);
+}
+
+void __aeabi_lasr(unsigned_int_struct val, int shift)
+{
+    aeabi_lsr(val, shift, val.high >> 31, unsigned_int);
+}
+
+
+/* Integer division functions */
+
+AEABI_UXDIVMOD(uidivmod, unsigned, uidiv_t, UINT)
+
+int __aeabi_idiv(int numerator, int denominator)
+{
+    unsigned num, den;
+    uidiv_t ret;
+
+    if (numerator >= 0)
+        num = numerator;
+    else
+        num = 0 - numerator;
+    if (denominator >= 0)
+        den = denominator;
+    else
+        den = 0 - denominator;
+    ret = aeabi_uidivmod(num, den);
+    if ((numerator & INT_MIN) != (denominator & INT_MIN)) /* signs differ */
+        ret.quot *= -1;
+    return ret.quot;
+}
+
+unsigned __aeabi_uidiv(unsigned num, unsigned den)
+{
+    return aeabi_uidivmod(num, den).quot;
+}
+
+__AEABI_XDIVMOD(idivmod, int, uidivmod, idiv_t, uidiv_t, INT)
+
+void __aeabi_uidivmod(unsigned num, unsigned den)
+{
+    uidiv_t_return(aeabi_uidivmod(num, den));
+}
diff --git a/tinyc/lib/armflush.c b/tinyc/lib/armflush.c
new file mode 100644
index 000000000..eae32605f
--- /dev/null
+++ b/tinyc/lib/armflush.c
@@ -0,0 +1,58 @@
+/* armflush.c - flush the instruction cache
+
+   __clear_cache is used in tccrun.c.  It is a built-in
+   intrinsic with gcc.  However, tcc needs this function
+   in order to compile itself. */
+
+#ifdef __TINYC__
+
+/* syscall wrapper.
+   Implemented below as raw instruction words.  The call number arrives as
+   the first argument (r0) and is moved into r7; the next three arguments
+   are shifted down from r1-r3 into r0-r2 before the svc instruction.
+   Only those three arguments are forwarded; nothing is loaded from the
+   stack. */
+unsigned syscall(unsigned syscall_nr, ...);
+
+/* arm-tcc supports only fake asm currently, hence the hand-encoded
+   opcodes; the mnemonic for each word is given in the trailing comment */
+__asm__(
+    ".global syscall\n"
+    "syscall:\n"
+    ".int 0xe92d4080\n"  // push    {r7, lr}
+    ".int 0xe1a07000\n"  // mov     r7, r0
+    ".int 0xe1a00001\n"  // mov     r0, r1
+    ".int 0xe1a01002\n"  // mov     r1, r2
+    ".int 0xe1a02003\n"  // mov     r2, r3
+    ".int 0xef000000\n"  // svc     0x00000000
+    ".int 0xe8bd8080\n"  // pop     {r7, pc}
+    );
+
+/* from unistd.h: */
+#if defined(__thumb__) || defined(__ARM_EABI__)
+# define __NR_SYSCALL_BASE      0x0
+#else
+# define __NR_SYSCALL_BASE      0x900000
+#endif
+#define __ARM_NR_BASE           (__NR_SYSCALL_BASE+0x0f0000)
+#define __ARM_NR_cacheflush     (__ARM_NR_BASE+2)
+
+#else
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <stdio.h>
+
+#endif
+
+/* Flushing for tccrun.
+   Issues the ARM-private cacheflush system call for the range delimited
+   by 'beginning' and 'end'; the third argument is always passed as 0. */
+void __clear_cache(void *beginning, void *end)
+{
+/* __ARM_NR_cacheflush is kernel private and should not be used in user space.
+ * However, there is no ARM asm parser in tcc so we use it for now */
+#if 1
+    syscall(__ARM_NR_cacheflush, beginning, end, 0);
+#else
+    /* never compiled (the condition above is hard-wired to 1); kept as a
+       sketch of an inline-asm variant */
+    __asm__ ("push {r7}\n\t"
+             "mov r7, #0xf0002\n\t"
+             "mov r2, #0\n\t"
+             "swi 0\n\t"
+             "pop {r7}\n\t"
+             "ret");
+#endif
+}
diff --git a/tinyc/lib/bcheck.c b/tinyc/lib/bcheck.c
index c59d04eb8..90f0ad2c3 100644
--- a/tinyc/lib/bcheck.c
+++ b/tinyc/lib/bcheck.c
@@ -21,60 +21,83 @@
 #include <stdio.h>
 #include <stdarg.h>
 #include <string.h>
-#if !defined(__FreeBSD__) && !defined(__DragonFly__) && !defined(__OpenBSD__)
+
+#if !defined(__FreeBSD__) \
+ && !defined(__FreeBSD_kernel__) \
+ && !defined(__DragonFly__) \
+ && !defined(__OpenBSD__) \
+ && !defined(__NetBSD__)
 #include <malloc.h>
 #endif
 
-//#define BOUND_DEBUG
+#if !defined(_WIN32)
+#include <unistd.h>
+#endif
+
+/* #define BOUND_DEBUG */
+
+#ifdef BOUND_DEBUG
+ #define dprintf(a...) fprintf(a)
+#else
+ #define dprintf(a...)
+#endif
 
 /* define so that bound array is static (faster, but use memory if
    bound checking not used) */
-//#define BOUND_STATIC
+/* #define BOUND_STATIC */
 
 /* use malloc hooks. Currently the code cannot be reliable if no hooks */
 #define CONFIG_TCC_MALLOC_HOOKS
-
 #define HAVE_MEMALIGN
 
-#if defined(__FreeBSD__) || defined(__DragonFly__) || defined(__dietlibc__) \
-    || defined(__UCLIBC__) || defined(__OpenBSD__)
-#warning Bound checking not fully supported in this environment.
+#if defined(__FreeBSD__) \
+ || defined(__FreeBSD_kernel__) \
+ || defined(__DragonFly__) \
+ || defined(__OpenBSD__) \
+ || defined(__NetBSD__) \
+ || defined(__dietlibc__) \
+ || defined(_WIN32)
+//#warning Bound checking does not support malloc (etc.) in this environment.
 #undef CONFIG_TCC_MALLOC_HOOKS
 #undef HAVE_MEMALIGN
 #endif
 
 #define BOUND_T1_BITS 13
 #define BOUND_T2_BITS 11
-#define BOUND_T3_BITS (32 - BOUND_T1_BITS - BOUND_T2_BITS)
+#define BOUND_T3_BITS (sizeof(size_t)*8 - BOUND_T1_BITS - BOUND_T2_BITS)
+/* log2(sizeof(BoundEntry)), used to turn a T2 index into a byte offset.
+   A BoundEntry is four word-sized fields: 16 bytes (2^4) on 32-bit
+   targets and 32 bytes (2^5) on 64-bit targets.  sizeof(size_t) itself
+   only gives the right value (4) on 32-bit targets. */
+#define BOUND_E_BITS  (sizeof(size_t) == 8 ? 5 : 4)
 
-#define BOUND_T1_SIZE (1 << BOUND_T1_BITS)
-#define BOUND_T2_SIZE (1 << BOUND_T2_BITS)
-#define BOUND_T3_SIZE (1 << BOUND_T3_BITS)
-#define BOUND_E_BITS  4
+#define BOUND_T1_SIZE ((size_t)1 << BOUND_T1_BITS)
+#define BOUND_T2_SIZE ((size_t)1 << BOUND_T2_BITS)
+#define BOUND_T3_SIZE ((size_t)1 << BOUND_T3_BITS)
 
 #define BOUND_T23_BITS (BOUND_T2_BITS + BOUND_T3_BITS)
-#define BOUND_T23_SIZE (1 << BOUND_T23_BITS)
+#define BOUND_T23_SIZE ((size_t)1 << BOUND_T23_BITS)
 
 
 /* this pointer is generated when bound check is incorrect */
 #define INVALID_POINTER ((void *)(-2))
 /* size of an empty region */
-#define EMPTY_SIZE        0xffffffff
+#define EMPTY_SIZE  ((size_t)(-1))
 /* size of an invalid region */
 #define INVALID_SIZE      0
 
 typedef struct BoundEntry {
-    unsigned long start;
-    unsigned long size;
+    size_t start;
+    size_t size;
     struct BoundEntry *next;
-    unsigned long is_invalid; /* true if pointers outside region are invalid */
+    size_t is_invalid; /* true if pointers outside region are invalid */
 } BoundEntry;
 
 /* external interface */
 void __bound_init(void);
-void __bound_new_region(void *p, unsigned long size);
+void __bound_new_region(void *p, size_t size);
 int __bound_delete_region(void *p);
 
+#ifdef __attribute__
+  /* an __attribute__ macro is defined in the system headers */
+  #undef __attribute__ 
+#endif
 #define FASTCALL __attribute__((regparm(3)))
 
 void *__bound_malloc(size_t size, const void *caller);
@@ -93,16 +116,13 @@ static void *saved_realloc_hook;
 static void *saved_memalign_hook;
 #endif
 
-/* linker definitions */
-extern char _end;
-
 /* TCC definitions */
 extern char __bounds_start; /* start of static bounds table */
 /* error message, just for TCC */
 const char *__bound_error_msg;
 
 /* runtime error output */
-extern void rt_error(unsigned long pc, const char *fmt, ...);
+extern void rt_error(size_t pc, const char *fmt, ...);
 
 #ifdef BOUND_STATIC
 static BoundEntry *__bound_t1[BOUND_T1_SIZE]; /* page table */
@@ -114,12 +134,12 @@ static BoundEntry *__bound_invalid_t2; /* invalid page, for invalid pointers */
 
 static BoundEntry *__bound_find_region(BoundEntry *e1, void *p)
 {
-    unsigned long addr, tmp;
+    size_t addr, tmp;
     BoundEntry *e;
 
     e = e1;
     while (e != NULL) {
-        addr = (unsigned long)p;
+        addr = (size_t)p;
         addr -= e->start;
         if (addr <= e->size) {
             /* put region at the head */
@@ -144,7 +164,8 @@ static BoundEntry *__bound_find_region(BoundEntry *e1, void *p)
 static void bound_error(const char *fmt, ...)
 {
     __bound_error_msg = fmt;
-    *(int *)0 = 0; /* force a runtime error */
+    fprintf(stderr,"%s %s: %s\n", __FILE__, __FUNCTION__, fmt);
+    *(void **)0 = 0; /* force a runtime error */
 }
 
 static void bound_alloc_error(void)
@@ -152,18 +173,17 @@ static void bound_alloc_error(void)
     bound_error("not enough memory for bound checking code");
 }
 
-/* currently, tcc cannot compile that because we use GNUC extensions */
-#if !defined(__TINYC__)
-
 /* return '(p + offset)' for pointer arithmetic (a pointer can reach
    the end of a region in this case */
-void * FASTCALL __bound_ptr_add(void *p, int offset)
+void * FASTCALL __bound_ptr_add(void *p, size_t offset)
 {
-    unsigned long addr = (unsigned long)p;
+    size_t addr = (size_t)p;
     BoundEntry *e;
-#if defined(BOUND_DEBUG)
-    printf("add: 0x%x %d\n", (int)p, offset);
-#endif
+
+    dprintf(stderr, "%s %s: %p %x\n",
+        __FILE__, __FUNCTION__, p, (unsigned)offset);
+
+    __bound_init();
 
     e = __bound_t1[addr >> (BOUND_T2_BITS + BOUND_T3_BITS)];
     e = (BoundEntry *)((char *)e + 
@@ -172,22 +192,29 @@ void * FASTCALL __bound_ptr_add(void *p, int offset)
     addr -= e->start;
     if (addr > e->size) {
         e = __bound_find_region(e, p);
-        addr = (unsigned long)p - e->start;
+        addr = (size_t)p - e->start;
     }
     addr += offset;
-    if (addr > e->size)
+    if (addr >= e->size) {
+	fprintf(stderr,"%s %s: %p is outside of the region\n",
+            __FILE__, __FUNCTION__, p + offset);
         return INVALID_POINTER; /* return an invalid pointer */
+    }
     return p + offset;
 }
 
 /* return '(p + offset)' for pointer indirection (the resulting must
    be strictly inside the region */
 #define BOUND_PTR_INDIR(dsize)                                          \
-void * FASTCALL __bound_ptr_indir ## dsize (void *p, int offset)        \
+void * FASTCALL __bound_ptr_indir ## dsize (void *p, size_t offset)     \
 {                                                                       \
-    unsigned long addr = (unsigned long)p;                              \
+    size_t addr = (size_t)p;                                            \
     BoundEntry *e;                                                      \
                                                                         \
+    dprintf(stderr, "%s %s: %p %x start\n",                             \
+        __FILE__, __FUNCTION__, p, (unsigned)offset);	                \
+									\
+    __bound_init();							\
     e = __bound_t1[addr >> (BOUND_T2_BITS + BOUND_T3_BITS)];            \
     e = (BoundEntry *)((char *)e +                                      \
                        ((addr >> (BOUND_T3_BITS - BOUND_E_BITS)) &      \
@@ -195,30 +222,47 @@ void * FASTCALL __bound_ptr_indir ## dsize (void *p, int offset)        \
     addr -= e->start;                                                   \
     if (addr > e->size) {                                               \
         e = __bound_find_region(e, p);                                  \
-        addr = (unsigned long)p - e->start;                             \
+        addr = (size_t)p - e->start;                                    \
     }                                                                   \
     addr += offset + dsize;                                             \
-    if (addr > e->size)                                                 \
+    if (addr > e->size) {                                               \
+	fprintf(stderr,"%s %s: %p is outside of the region\n",          \
+            __FILE__, __FUNCTION__, p + offset);                        \
         return INVALID_POINTER; /* return an invalid pointer */         \
+    }									\
+    dprintf(stderr, "%s %s: return p+offset = %p\n",                    \
+        __FILE__, __FUNCTION__, p + offset);                            \
     return p + offset;                                                  \
 }
 
-#ifdef __i386__
+BOUND_PTR_INDIR(1)
+BOUND_PTR_INDIR(2)
+BOUND_PTR_INDIR(4)
+BOUND_PTR_INDIR(8)
+BOUND_PTR_INDIR(12)
+BOUND_PTR_INDIR(16)
+
+#if defined(__GNUC__) && (__GNUC__ >= 6)
+/*
+ * At least gcc 6.2 complains when __builtin_frame_address is used with
+ * nonzero argument.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wframe-address"
+#endif
+
 /* return the frame pointer of the caller */
 #define GET_CALLER_FP(fp)\
 {\
-    unsigned long *fp1;\
-    __asm__ __volatile__ ("movl %%ebp,%0" :"=g" (fp1));\
-    fp = fp1[0];\
+    fp = (size_t)__builtin_frame_address(1);\
 }
-#else
-#error put code to extract the calling frame pointer
-#endif
 
 /* called when entering a function to add all the local regions */
 void FASTCALL __bound_local_new(void *p1) 
 {
-    unsigned long addr, size, fp, *p = p1;
+    size_t addr, size, fp, *p = p1;
+
+    dprintf(stderr, "%s, %s start p1=%p\n", __FILE__, __FUNCTION__, p);
     GET_CALLER_FP(fp);
     for(;;) {
         addr = p[0];
@@ -229,12 +273,13 @@ void FASTCALL __bound_local_new(void *p1)
         p += 2;
         __bound_new_region((void *)addr, size);
     }
+    dprintf(stderr, "%s, %s end\n", __FILE__, __FUNCTION__);
 }
 
 /* called when leaving a function to delete all the local regions */
 void FASTCALL __bound_local_delete(void *p1) 
 {
-    unsigned long addr, fp, *p = p1;
+    size_t addr, fp, *p = p1;
     GET_CALLER_FP(fp);
     for(;;) {
         addr = p[0];
@@ -246,38 +291,14 @@ void FASTCALL __bound_local_delete(void *p1)
     }
 }
 
-#else
-
-void __bound_local_new(void *p) 
-{
-}
-void __bound_local_delete(void *p) 
-{
-}
-
-void *__bound_ptr_add(void *p, int offset)
-{
-    return p + offset;
-}
-
-#define BOUND_PTR_INDIR(dsize)                               \
-void *__bound_ptr_indir ## dsize (void *p, int offset)       \
-{                                                            \
-    return p + offset;                                       \
-}
+#if defined(__GNUC__) && (__GNUC__ >= 6)
+#pragma GCC diagnostic pop
 #endif
 
-BOUND_PTR_INDIR(1)
-BOUND_PTR_INDIR(2)
-BOUND_PTR_INDIR(4)
-BOUND_PTR_INDIR(8)
-BOUND_PTR_INDIR(12)
-BOUND_PTR_INDIR(16)
-
 static BoundEntry *__bound_new_page(void)
 {
     BoundEntry *page;
-    int i;
+    size_t i;
 
     page = libc_malloc(sizeof(BoundEntry) * BOUND_T2_SIZE);
     if (!page)
@@ -305,11 +326,11 @@ static void bound_free_entry(BoundEntry *e)
     libc_free(e);
 }
 
-static inline BoundEntry *get_page(int index)
+static BoundEntry *get_page(size_t index)
 {
     BoundEntry *page;
     page = __bound_t1[index];
-    if (page == __bound_empty_t2 || page == __bound_invalid_t2) {
+    if (!page || page == __bound_empty_t2 || page == __bound_invalid_t2) {
         /* create a new page if necessary */
         page = __bound_new_page();
         __bound_t1[index] = page;
@@ -318,11 +339,11 @@ static inline BoundEntry *get_page(int index)
 }
 
 /* mark a region as being invalid (can only be used during init) */
-static void mark_invalid(unsigned long addr, unsigned long size)
+static void mark_invalid(size_t addr, size_t size)
 {
-    unsigned long start, end;
+    size_t start, end;
     BoundEntry *page;
-    int t1_start, t1_end, i, j, t2_start, t2_end;
+    size_t t1_start, t1_end, i, j, t2_start, t2_end;
 
     start = addr;
     end = addr + size;
@@ -334,7 +355,7 @@ static void mark_invalid(unsigned long addr, unsigned long size)
         t2_end = 1 << (BOUND_T1_BITS + BOUND_T2_BITS);
 
 #if 0
-    printf("mark_invalid: start = %x %x\n", t2_start, t2_end);
+    dprintf(stderr, "mark_invalid: start = %x %x\n", t2_start, t2_end);
 #endif
     
     /* first we handle full pages */
@@ -373,10 +394,18 @@ static void mark_invalid(unsigned long addr, unsigned long size)
 
 void __bound_init(void)
 {
-    int i;
+    size_t i;
     BoundEntry *page;
-    unsigned long start, size;
-    int *p;
+    size_t start, size;
+    size_t *p;
+
+    static int inited;
+    if (inited)
+	return;
+
+    inited = 1;
+
+    dprintf(stderr, "%s, %s() start\n", __FILE__, __FUNCTION__);
 
     /* save malloc hooks and install bound check hooks */
     install_malloc_hooks();
@@ -402,29 +431,71 @@ void __bound_init(void)
     __bound_invalid_t2 = page;
 
     /* invalid pointer zone */
-    start = (unsigned long)INVALID_POINTER & ~(BOUND_T23_SIZE - 1);
+    start = (size_t)INVALID_POINTER & ~(BOUND_T23_SIZE - 1);
     size = BOUND_T23_SIZE;
     mark_invalid(start, size);
 
-#if !defined(__TINYC__) && defined(CONFIG_TCC_MALLOC_HOOKS)
+#if defined(CONFIG_TCC_MALLOC_HOOKS)
     /* malloc zone is also marked invalid. can only use that with
-       hooks because all libs should use the same malloc. The solution
-       would be to build a new malloc for tcc. */
-    start = (unsigned long)&_end;
+     * hooks because all libs should use the same malloc. The solution
+     * would be to build a new malloc for tcc.
+     *
+     * usually heap (= malloc zone) comes right after bss, i.e. after _end, but
+     * not always - either if we are running from under `tcc -b -run`, or if
+     * address space randomization is turned on(a), heap start will be separated
+     * from bss end.
+     *
+     * So sbrk(0) will be a good approximation for start_brk:
+     *
+     *   - if we are a separately compiled program, __bound_init() runs early,
+     *     and sbrk(0) should be equal or very near to start_brk(b) (in case other
+     *     constructors malloc something), or
+     *
+     *   - if we are running from under `tcc -b -run`, sbrk(0) will return
+     *     start of heap portion which is under this program control, and not
+     *     mark as invalid earlier allocated memory.
+     *
+     *
+     * (a) /proc/sys/kernel/randomize_va_space = 2, on Linux;
+     *     usually turned on by default.
+     *
+     * (b) on Linux >= v3.3, the alternative is to read
+     *     start_brk from /proc/self/stat
+     */
+    start = (size_t)sbrk(0);
     size = 128 * 0x100000;
     mark_invalid(start, size);
 #endif
 
     /* add all static bound check values */
-    p = (int *)&__bounds_start;
+    p = (size_t *)&__bounds_start;
     while (p[0] != 0) {
         __bound_new_region((void *)p[0], p[1]);
         p += 2;
     }
+
+    dprintf(stderr, "%s, %s() end\n\n", __FILE__, __FUNCTION__);
+}
+
+/* Register a NULL-terminated array of pointers as one bounded region.
+   The region starts at 'p' and extends up to and including the
+   terminating NULL slot. */
+void __bound_main_arg(void **p)
+{
+    void *start = p;
+    size_t len;
+
+    /* advance past every slot, stopping just after the NULL terminator */
+    for (;;) {
+        void *slot = *p++;
+        if (!slot)
+            break;
+    }
+    len = (char *)p - (char *)start;
+
+    dprintf(stderr, "%s, %s calling __bound_new_region(%p %x)\n",
+            __FILE__, __FUNCTION__, start, (unsigned)len);
+
+    __bound_new_region(start, len);
+}
+
+/* Tear-down counterpart of __bound_init(): traces the call when
+   BOUND_DEBUG is defined and puts the saved malloc hooks back via
+   restore_malloc_hooks(). */
+void __bound_exit(void)
+{
+    dprintf(stderr, "%s, %s()\n", __FILE__, __FUNCTION__);
+    restore_malloc_hooks();
+}
 
 static inline void add_region(BoundEntry *e, 
-                              unsigned long start, unsigned long size)
+                              size_t start, size_t size)
 {
     BoundEntry *e1;
     if (e->start == 0) {
@@ -444,13 +515,18 @@ static inline void add_region(BoundEntry *e,
 }
 
 /* create a new region. It should not already exist in the region list */
-void __bound_new_region(void *p, unsigned long size)
+void __bound_new_region(void *p, size_t size)
 {
-    unsigned long start, end;
+    size_t start, end;
     BoundEntry *page, *e, *e2;
-    int t1_start, t1_end, i, t2_start, t2_end;
+    size_t t1_start, t1_end, i, t2_start, t2_end;
+
+    dprintf(stderr, "%s, %s(%p, %x) start\n",
+        __FILE__, __FUNCTION__, p, (unsigned)size);
 
-    start = (unsigned long)p;
+    __bound_init();
+
+    start = (size_t)p;
     end = start + size;
     t1_start = start >> (BOUND_T2_BITS + BOUND_T3_BITS);
     t1_end = end >> (BOUND_T2_BITS + BOUND_T3_BITS);
@@ -461,10 +537,7 @@ void __bound_new_region(void *p, unsigned long size)
         ((BOUND_T2_SIZE - 1) << BOUND_E_BITS);
     t2_end = (end >> (BOUND_T3_BITS - BOUND_E_BITS)) & 
         ((BOUND_T2_SIZE - 1) << BOUND_E_BITS);
-#ifdef BOUND_DEBUG
-    printf("new %lx %lx %x %x %x %x\n", 
-           start, end, t1_start, t1_end, t2_start, t2_end);
-#endif
+
 
     e = (BoundEntry *)((char *)page + t2_start);
     add_region(e, start, size);
@@ -506,16 +579,17 @@ void __bound_new_region(void *p, unsigned long size)
         }
         add_region(e, start, size);
     }
+
+    dprintf(stderr, "%s, %s end\n", __FILE__, __FUNCTION__);
 }
 
 /* delete a region */
-static inline void delete_region(BoundEntry *e, 
-                                 void *p, unsigned long empty_size)
+static inline void delete_region(BoundEntry *e, void *p, size_t empty_size)
 {
-    unsigned long addr;
+    size_t addr;
     BoundEntry *e1;
 
-    addr = (unsigned long)p;
+    addr = (size_t)p;
     addr -= e->start;
     if (addr <= e->size) {
         /* region found is first one */
@@ -539,7 +613,7 @@ static inline void delete_region(BoundEntry *e,
             /* region not found: do nothing */
             if (e == NULL)
                 break;
-            addr = (unsigned long)p - e->start;
+            addr = (size_t)p - e->start;
             if (addr <= e->size) {
                 /* found: remove entry */
                 e1->next = e->next;
@@ -554,11 +628,15 @@ static inline void delete_region(BoundEntry *e,
 /* return non zero if error */
 int __bound_delete_region(void *p)
 {
-    unsigned long start, end, addr, size, empty_size;
+    size_t start, end, addr, size, empty_size;
     BoundEntry *page, *e, *e2;
-    int t1_start, t1_end, t2_start, t2_end, i;
+    size_t t1_start, t1_end, t2_start, t2_end, i;
 
-    start = (unsigned long)p;
+    dprintf(stderr, "%s %s() start\n", __FILE__, __FUNCTION__);
+
+    __bound_init();
+
+    start = (size_t)p;
     t1_start = start >> (BOUND_T2_BITS + BOUND_T3_BITS);
     t2_start = (start >> (BOUND_T3_BITS - BOUND_E_BITS)) & 
         ((BOUND_T2_SIZE - 1) << BOUND_E_BITS);
@@ -570,7 +648,7 @@ int __bound_delete_region(void *p)
     if (addr > e->size)
         e = __bound_find_region(e, p);
     /* test if invalid region */
-    if (e->size == EMPTY_SIZE || (unsigned long)p != e->start) 
+    if (e->size == EMPTY_SIZE || (size_t)p != e->start) 
         return -1;
     /* compute the size we put in invalid regions */
     if (e->is_invalid)
@@ -616,7 +694,7 @@ int __bound_delete_region(void *p)
             }
         }
         /* last page */
-        page = get_page(t2_end);
+        page = get_page(t1_end);
         e2 = (BoundEntry *)((char *)page + t2_end);
         for(e=page;e<e2;e++) {
             e->start = 0;
@@ -624,14 +702,17 @@ int __bound_delete_region(void *p)
         }
         delete_region(e, p, empty_size);
     }
+
+    dprintf(stderr, "%s %s() end\n", __FILE__, __FUNCTION__);
+
     return 0;
 }
 
 /* return the size of the region starting at p, or EMPTY_SIZE if non
    existent region. */
-static unsigned long get_region_size(void *p)
+static size_t get_region_size(void *p)
 {
-    unsigned long addr = (unsigned long)p;
+    size_t addr = (size_t)p;
     BoundEntry *e;
 
     e = __bound_t1[addr >> (BOUND_T2_BITS + BOUND_T3_BITS)];
@@ -641,13 +722,16 @@ static unsigned long get_region_size(void *p)
     addr -= e->start;
     if (addr > e->size)
         e = __bound_find_region(e, p);
-    if (e->start != (unsigned long)p)
+    if (e->start != (size_t)p)
         return EMPTY_SIZE;
     return e->size;
 }
 
 /* patched memory functions */
 
+/* force compiler to perform stores coded up to this point */
+#define barrier()   __asm__ __volatile__ ("": : : "memory")
+
 static void install_malloc_hooks(void)
 {
 #ifdef CONFIG_TCC_MALLOC_HOOKS
@@ -659,6 +743,8 @@ static void install_malloc_hooks(void)
     __free_hook = __bound_free;
     __realloc_hook = __bound_realloc;
     __memalign_hook = __bound_memalign;
+
+    barrier();
 #endif
 }
 
@@ -669,6 +755,8 @@ static void restore_malloc_hooks(void)
     __free_hook = saved_free_hook;
     __realloc_hook = saved_realloc_hook;
     __memalign_hook = saved_memalign_hook;
+
+    barrier();
 #endif
 }
 
@@ -702,6 +790,10 @@ void *__bound_malloc(size_t size, const void *caller)
     
     if (!ptr)
         return NULL;
+
+    dprintf(stderr, "%s, %s calling __bound_new_region(%p, %x)\n",
+           __FILE__, __FUNCTION__, ptr, (unsigned)size);
+
     __bound_new_region(ptr, size);
     return ptr;
 }
@@ -731,6 +823,10 @@ void *__bound_memalign(size_t size, size_t align, const void *caller)
     
     if (!ptr)
         return NULL;
+
+    dprintf(stderr, "%s, %s calling __bound_new_region(%p, %x)\n",
+           __FILE__, __FUNCTION__, ptr, (unsigned)size);
+
     __bound_new_region(ptr, size);
     return ptr;
 }
@@ -748,7 +844,7 @@ void __bound_free(void *ptr, const void *caller)
 void *__bound_realloc(void *ptr, size_t size, const void *caller)
 {
     void *ptr1;
-    int old_size;
+    size_t old_size;
 
     if (size == 0) {
         __bound_free(ptr, caller);
@@ -783,23 +879,23 @@ void *__bound_calloc(size_t nmemb, size_t size)
 static void bound_dump(void)
 {
     BoundEntry *page, *e;
-    int i, j;
+    size_t i, j;
 
-    printf("region dump:\n");
+    fprintf(stderr, "region dump:\n");
     for(i=0;i<BOUND_T1_SIZE;i++) {
         page = __bound_t1[i];
         for(j=0;j<BOUND_T2_SIZE;j++) {
             e = page + j;
             /* do not print invalid or empty entries */
             if (e->size != EMPTY_SIZE && e->start != 0) {
-                printf("%08x:", 
+                fprintf(stderr, "%08x:", 
                        (i << (BOUND_T2_BITS + BOUND_T3_BITS)) + 
                        (j << BOUND_T3_BITS));
                 do {
-                    printf(" %08lx:%08lx", e->start, e->start + e->size);
+                    fprintf(stderr, " %08lx:%08lx", e->start, e->start + e->size);
                     e = e->next;
                 } while (e != NULL);
-                printf("\n");
+                fprintf(stderr, "\n");
             }
         }
     }
@@ -813,19 +909,28 @@ static void __bound_check(const void *p, size_t size)
 {
     if (size == 0)
         return;
-    p = __bound_ptr_add((void *)p, size);
+    p = __bound_ptr_add((void *)p, size - 1);
     if (p == INVALID_POINTER)
         bound_error("invalid pointer");
 }
 
 void *__bound_memcpy(void *dst, const void *src, size_t size)
 {
+    void* p;
+
+    dprintf(stderr, "%s %s: start, dst=%p src=%p size=%x\n",
+            __FILE__, __FUNCTION__, dst, src, (unsigned)size);
+
     __bound_check(dst, size);
     __bound_check(src, size);
     /* check also region overlap */
     if (src >= dst && src < dst + size)
         bound_error("overlapping regions in memcpy()");
-    return memcpy(dst, src, size);
+
+    p = memcpy(dst, src, size);
+
+    dprintf(stderr, "%s %s: end, p=%p\n", __FILE__, __FUNCTION__, p);
+    return p;
 }
 
 void *__bound_memmove(void *dst, const void *src, size_t size)
@@ -845,7 +950,7 @@ void *__bound_memset(void *dst, int c, size_t size)
 int __bound_strlen(const char *s)
 {
     const char *p;
-    int len;
+    size_t len;
 
     len = 0;
     for(;;) {
@@ -861,8 +966,14 @@ int __bound_strlen(const char *s)
 
 char *__bound_strcpy(char *dst, const char *src)
 {
-    int len;
+    size_t len;
+    void *p;
+
+    dprintf(stderr, "%s %s: strcpy start, dst=%p src=%p\n",
+            __FILE__, __FUNCTION__, dst, src);
     len = __bound_strlen(src);
-    return __bound_memcpy(dst, src, len + 1);
+    p = __bound_memcpy(dst, src, len + 1);
+    dprintf(stderr, "%s %s: strcpy end, p = %p\n",
+            __FILE__, __FUNCTION__, p);
+    return p;
 }
-
diff --git a/tinyc/lib/lib-arm64.c b/tinyc/lib/lib-arm64.c
new file mode 100644
index 000000000..b8fd9e85a
--- /dev/null
+++ b/tinyc/lib/lib-arm64.c
@@ -0,0 +1,664 @@
+/*
+ *  TCC runtime library for arm64.
+ *
+ *  Copyright (c) 2015 Edmund Grimley Evans
+ *
+ * Copying and distribution of this file, with or without modification,
+ * are permitted in any medium without royalty provided the copyright
+ * notice and this notice are preserved.  This file is offered as-is,
+ * without any warranty.
+ */
+
+#ifdef __TINYC__
+typedef signed char int8_t;
+typedef unsigned char uint8_t;
+typedef short int16_t;
+typedef unsigned short uint16_t;
+typedef int int32_t;
+typedef unsigned uint32_t;
+typedef long long int64_t;
+typedef unsigned long long uint64_t;
+void *memcpy(void*,void*,__SIZE_TYPE__);
+#else
+#include <stdint.h>
+#include <string.h>
+#endif
+
+/* Forward the range [beg, end] arguments unchanged to
+   __arm64_clear_cache().
+   NOTE(review): __arm64_clear_cache is not declared in this file;
+   presumably it is provided by the compiler -- confirm. */
+void __clear_cache(void *beg, void *end)
+{
+    __arm64_clear_cache(beg, end);
+}
+
+typedef struct {
+    uint64_t x0, x1;
+} u128_t;
+
+/* Build a zero with the given sign: every bit is clear except bit 63 of
+   the upper word, which carries 'sgn'. */
+static long double f3_zero(int sgn)
+{
+    long double result;
+    u128_t bits;
+
+    bits.x0 = 0;
+    bits.x1 = (uint64_t)sgn << 63;
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+/* Build an infinity with the given sign: exponent field all ones
+   (bits 48..62 of the upper word), fraction zero. */
+static long double f3_infinity(int sgn)
+{
+    long double result;
+    u128_t bits;
+
+    bits.x0 = 0;
+    bits.x1 = ((uint64_t)sgn << 63) | 0x7fff000000000000;
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+/* Build the NaN value returned for invalid operations.  Two bit patterns
+   are provided; the preprocessor selects the second one. */
+static long double f3_NaN(void)
+{
+    long double result;
+    u128_t bits;
+
+#if 0
+    // ARM's default NaN usually has just the top fraction bit set:
+    bits.x0 = 0;
+    bits.x1 = 0x7fff800000000000;
+#else
+    // GCC's library sets all fraction bits:
+    bits.x0 = ~(uint64_t)0;
+    bits.x1 = 0x7fffffffffffffff;
+#endif
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+/* Write a NaN built from an unpacked operand into *f.
+   The operand's mantissa words are kept, the exponent field is forced to
+   all ones, bit 47 of the upper word is set, and 'sgn' goes into bit 63.
+   Always returns 1. */
+static int fp3_convert_NaN(long double *f, int sgn, u128_t mnt)
+{
+    u128_t bits;
+
+    bits.x0 = mnt.x0;
+    bits.x1 = mnt.x1 | 0x7fff800000000000 | ((uint64_t)sgn << 63);
+    memcpy(f, &bits, 16);
+    return 1;
+}
+
+/* Check two unpacked operands for NaNs.
+   An operand is a NaN when its exponent is 32767 and any of its low 112
+   mantissa bits is set; bit 47 of the upper word distinguishes quiet
+   (set) from signalling (clear).
+   Priority: signalling a, signalling b, quiet a, quiet b.  On a match the
+   converted NaN is stored in *f and 1 is returned; otherwise 0. */
+static int fp3_detect_NaNs(long double *f,
+                           int a_sgn, int a_exp, u128_t a,
+                           int b_sgn, int b_exp, u128_t b)
+{
+    // "<< 16" drops the implicit bit (bit 48) and everything above it
+    int a_is_nan = a_exp == 32767 && (a.x0 | a.x1 << 16) != 0;
+    int b_is_nan = b_exp == 32767 && (b.x0 | b.x1 << 16) != 0;
+    int a_is_quiet = (int)(a.x1 >> 47 & 1);
+    int b_is_quiet = (int)(b.x1 >> 47 & 1);
+
+    // Signalling NaNs take precedence over quiet ones:
+    if (a_is_nan && !a_is_quiet)
+        return fp3_convert_NaN(f, a_sgn, a);
+    if (b_is_nan && !b_is_quiet)
+        return fp3_convert_NaN(f, b_sgn, b);
+
+    // Any remaining NaN is quiet:
+    if (a_is_nan)
+        return fp3_convert_NaN(f, a_sgn, a);
+    if (b_is_nan)
+        return fp3_convert_NaN(f, b_sgn, b);
+
+    return 0;
+}
+
+/* Split a long double into sign, exponent and mantissa.
+   *sgn receives bit 63 of the upper word and *exp the 15-bit exponent
+   field.  *mnt receives the 112 fraction bits; when the exponent field is
+   non-zero the implicit bit (bit 48 of x1) is added, otherwise *exp is
+   reported as 1. */
+static void f3_unpack(int *sgn, int32_t *exp, u128_t *mnt, long double f)
+{
+    u128_t bits;
+    uint64_t hi;
+    int32_t biased;
+
+    memcpy(&bits, &f, 16);
+    hi = bits.x1;
+    biased = hi >> 48 & 32767;
+
+    *sgn = hi >> 63;
+    bits.x1 = hi & (((uint64_t)1 << 48) - 1);   // keep the fraction bits only
+    if (biased != 0) {
+        bits.x1 |= (uint64_t)1 << 48;
+        *exp = biased;
+    } else {
+        *exp = 1;
+    }
+    *mnt = bits;
+}
+
+/* Shift a 128-bit mantissa left until bit 63 of x1 is set, decreasing
+   *exp by the number of positions shifted.  A zero mantissa is returned
+   unchanged and *exp is left alone. */
+static u128_t f3_normalise(int32_t *exp, u128_t mnt)
+{
+    int sh;
+    if (!(mnt.x0 | mnt.x1))
+        return mnt;
+    // Upper word empty: move the lower word up in one 64-bit step
+    if (!mnt.x1) {
+        mnt.x1 = mnt.x0;
+        mnt.x0 = 0;
+        *exp -= 64;
+    }
+    // Remove the remaining leading zeros with shifts of 32, 16, 8, 4, 2, 1;
+    // each step is taken only when the top 'sh' bits of x1 are all zero
+    for (sh = 32; sh; sh >>= 1) {
+        if (!(mnt.x1 >> (64 - sh))) {
+            mnt.x1 = mnt.x1 << sh | mnt.x0 >> (64 - sh);
+            mnt.x0 = mnt.x0 << sh;
+            *exp -= sh;
+        }
+    }
+    return mnt;
+}
+
+/* Shift a 128-bit value right by 'sh' bits, ORing a 1 into bit 0 of the
+   result whenever any of the bits shifted out was set ("sticky" bit).
+   A shift count of zero or less returns x unchanged. */
+static u128_t f3_sticky_shift(int32_t sh, u128_t x)
+{
+  // Everything is shifted out: only the sticky bit can remain
+  if (sh >= 128) {
+      x.x0 = !!(x.x0 | x.x1);
+      x.x1 = 0;
+      return x;
+  }
+  // Whole-word part of the shift: the old low word collapses to a sticky bit
+  if (sh >= 64) {
+      x.x0 = x.x1 | !!x.x0;
+      x.x1 = 0;
+      sh -= 64;
+  }
+  // Remaining 1..63 bits; the guard keeps "64 - sh" a valid shift count
+  if (sh > 0) {
+      x.x0 = x.x0 >> sh | x.x1 << (64 - sh) | !!(x.x0 << (64 - sh));
+      x.x1 = x.x1 >> sh;
+  }
+  return x;
+}
+
+/* Round a 128-bit intermediate mantissa and pack it into a long double.
+   sgn: sign bit of the result; exp: exponent to store (values <= 0 take
+   the second branch below and are stored with exponent field 0);
+   x: mantissa before rounding.
+   Returns infinity of the given sign when the final exponent reaches
+   32767. */
+static long double f3_round(int sgn, int32_t exp, u128_t x)
+{
+    long double f;
+    int error;
+
+    // Bring the mantissa down so that two extra bits remain below the
+    // result's least significant bit; exponents <= 0 shift further
+    if (exp > 0) {
+        x = f3_sticky_shift(13, x);
+    }
+    else {
+        x = f3_sticky_shift(14 - exp, x);
+        exp = 0;
+    }
+
+    // error: bit 1 = half of the last place, bit 0 = sticky
+    error = x.x0 & 3;
+    x.x0 = x.x0 >> 2 | x.x1 << 62;
+    x.x1 = x.x1 >> 2;
+
+    // Round to nearest, ties to even: up when above half, or exactly half
+    // with an odd result
+    if (error == 3 || ((error == 2) & (x.x0 & 1))) {
+        if (!++x.x0) {
+            // carry out of the low word
+            ++x.x1;
+            if (x.x1 == (uint64_t)1 << 48)
+                exp = 1;
+            else if (x.x1 == (uint64_t)1 << 49) {
+                ++exp;
+                x.x0 = x.x0 >> 1 | x.x1 << 63;
+                x.x1 = x.x1 >> 1;
+            }
+        }
+    }
+
+    if (exp >= 32767)
+        return f3_infinity(sgn);
+
+    // Pack: low 48 bits of x1 are fraction, then exponent, then sign
+    x.x1 = x.x1 << 16 >> 16 | (uint64_t)exp << 48 | (uint64_t)sgn << 63;
+    memcpy(&f, &x, 16);
+    return f;
+}
+
+/* Shared implementation of addition and subtraction.
+   fa, fb: operands; neg: 0 to compute fa + fb, 1 to compute fa - fb
+   (it is XORed into the sign of fb after the NaN check).
+   Returns the rounded result produced by f3_round(), or a NaN, infinity
+   or zero for the special cases handled up front. */
+static long double f3_add(long double fa, long double fb, int neg)
+{
+    u128_t a, b, x;
+    int32_t a_exp, b_exp, x_exp;
+    int a_sgn, b_sgn, x_sgn;
+    long double fx;
+
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    f3_unpack(&b_sgn, &b_exp, &b, fb);
+
+    if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b))
+        return fx;
+
+    b_sgn ^= neg;
+
+    // Handle infinities and zeroes:
+    if (a_exp == 32767 && b_exp == 32767 && a_sgn != b_sgn)
+        return f3_NaN();
+    if (a_exp == 32767)
+        return f3_infinity(a_sgn);
+    if (b_exp == 32767)
+        return f3_infinity(b_sgn);
+    if (!(a.x0 | a.x1 | b.x0 | b.x1))
+        return f3_zero(a_sgn & b_sgn);
+
+    // Scale both mantissas up by three bits before aligning them
+    a.x1 = a.x1 << 3 | a.x0 >> 61;
+    a.x0 = a.x0 << 3;
+    b.x1 = b.x1 << 3 | b.x0 >> 61;
+    b.x0 = b.x0 << 3;
+
+    // Align the operand with the smaller exponent to the larger one
+    if (a_exp <= b_exp) {
+        a = f3_sticky_shift(b_exp - a_exp, a);
+        a_exp = b_exp;
+    }
+    else {
+        b = f3_sticky_shift(a_exp - b_exp, b);
+        b_exp = a_exp;
+    }
+
+    x_sgn = a_sgn;
+    x_exp = a_exp;
+    if (a_sgn == b_sgn) {
+        // Same sign: 128-bit add with carry from the low word
+        x.x0 = a.x0 + b.x0;
+        x.x1 = a.x1 + b.x1 + (x.x0 < a.x0);
+    }
+    else {
+        // Opposite signs: 128-bit subtract with borrow; if the difference
+        // is negative, flip the result sign and negate the magnitude
+        x.x0 = a.x0 - b.x0;
+        x.x1 = a.x1 - b.x1 - (x.x0 > a.x0);
+        if (x.x1 >> 63) {
+            x_sgn ^= 1;
+            x.x0 = -x.x0;
+            x.x1 = -x.x1 - !!x.x0;
+        }
+    }
+
+    // Exact cancellation gives +0
+    if (!(x.x0 | x.x1))
+        return f3_zero(0);
+
+    x = f3_normalise(&x_exp, x);
+
+    return f3_round(x_sgn, x_exp + 12, x);
+}
+
+/* long double addition: a + b, via f3_add() with neg = 0. */
+long double __addtf3(long double a, long double b)
+{
+    return f3_add(a, b, 0);
+}
+
+/* long double subtraction: a - b, via f3_add() with neg = 1 (the sign of
+   b is flipped inside f3_add). */
+long double __subtf3(long double a, long double b)
+{
+    return f3_add(a, b, 1);
+}
+
+/* long double multiplication: fa * fb.
+   Unpacks both operands, handles NaNs, infinities and zeroes up front,
+   then multiplies the normalised mantissas limb by limb and passes the
+   top 128 bits of the product (with a sticky low bit) to f3_round().
+   The result sign is the XOR of the operand signs. */
+long double __multf3(long double fa, long double fb)
+{
+    u128_t a, b, x;
+    int32_t a_exp, b_exp, x_exp;
+    int a_sgn, b_sgn, x_sgn;
+    long double fx;
+
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    f3_unpack(&b_sgn, &b_exp, &b, fb);
+
+    if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b))
+        return fx;
+
+    // Handle infinities and zeroes:
+    // (infinity times zero is invalid and yields a NaN)
+    if ((a_exp == 32767 && !(b.x0 | b.x1)) ||
+        (b_exp == 32767 && !(a.x0 | a.x1)))
+        return f3_NaN();
+    if (a_exp == 32767 || b_exp == 32767)
+        return f3_infinity(a_sgn ^ b_sgn);
+    if (!(a.x0 | a.x1) || !(b.x0 | b.x1))
+        return f3_zero(a_sgn ^ b_sgn);
+
+    // Left-justify both mantissas so the top bit of x1 is set
+    a = f3_normalise(&a_exp, a);
+    b = f3_normalise(&b_exp, b);
+
+    x_sgn = a_sgn ^ b_sgn;
+    x_exp = a_exp + b_exp - 16352;
+
+    {
+        // Convert to base (1 << 30), discarding bottom 6 bits, which are zero,
+        // so there are (32, 30, 30, 30) bits in (a3, a2, a1, a0):
+        uint64_t a0 = a.x0 << 28 >> 34;
+        uint64_t b0 = b.x0 << 28 >> 34;
+        uint64_t a1 = a.x0 >> 36 | a.x1 << 62 >> 34;
+        uint64_t b1 = b.x0 >> 36 | b.x1 << 62 >> 34;
+        uint64_t a2 = a.x1 << 32 >> 34;
+        uint64_t b2 = b.x1 << 32 >> 34;
+        uint64_t a3 = a.x1 >> 32;
+        uint64_t b3 = b.x1 >> 32;
+        // Use 16 small multiplications and additions that do not overflow:
+        // (each xN starts with the carry out of the previous column)
+        uint64_t x0 = a0 * b0;
+        uint64_t x1 = (x0 >> 30) + a0 * b1 + a1 * b0;
+        uint64_t x2 = (x1 >> 30) + a0 * b2 + a1 * b1 + a2 * b0;
+        uint64_t x3 = (x2 >> 30) + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
+        uint64_t x4 = (x3 >> 30) + a1 * b3 + a2 * b2 + a3 * b1;
+        uint64_t x5 = (x4 >> 30) + a2 * b3 + a3 * b2;
+        uint64_t x6 = (x5 >> 30) + a3 * b3;
+        // We now have (64, 30, 30, ...) bits in (x6, x5, x4, ...).
+        // Take the top 128 bits, setting bottom bit if any lower bits were set:
+        uint64_t y0 = (x5 << 34 | x4 << 34 >> 30 | x3 << 34 >> 60 |
+                       !!(x3 << 38 | (x2 | x1 | x0) << 34));
+        uint64_t y1 = x6;
+        // Top bit may be zero. Renormalise:
+        if (!(y1 >> 63)) {
+            y1 = y1 << 1 | y0 >> 63;
+            y0 = y0 << 1;
+            --x_exp;
+        }
+        x.x0 = y0;
+        x.x1 = y1;
+    }
+
+    return f3_round(x_sgn, x_exp, x);
+}
+
+// Quad-precision soft-float division: returns fa / fb.
+// After special values are dispatched, the quotient mantissa is produced one
+// bit per iteration by a compare/subtract/shift loop, and any remainder left
+// over is folded into the lowest bit as a sticky flag before rounding.
+long double __divtf3(long double fa, long double fb)
+{
+    u128_t a, b, x;
+    int32_t a_exp, b_exp, x_exp;
+    int a_sgn, b_sgn, x_sgn, i;
+    long double fx;
+
+    f3_unpack(&a_sgn, &a_exp, &a, fa);
+    f3_unpack(&b_sgn, &b_exp, &b, fb);
+
+    // When the helper reports a hit, fx already holds the value to return
+    // (presumably a NaN derived from the operands -- helper not shown here).
+    if (fp3_detect_NaNs(&fx, a_sgn, a_exp, a, b_sgn, b_exp, b))
+        return fx;
+
+    // Handle infinities and zeroes:
+    // inf / inf and 0 / 0 yield f3_NaN(); otherwise an infinite dividend or
+    // zero divisor yields a signed infinity, and a zero dividend or infinite
+    // divisor yields a signed zero.
+    if ((a_exp == 32767 && b_exp == 32767) ||
+        (!(a.x0 | a.x1) && !(b.x0 | b.x1)))
+        return f3_NaN();
+    if (a_exp == 32767 || !(b.x0 | b.x1))
+        return f3_infinity(a_sgn ^ b_sgn);
+    if (!(a.x0 | a.x1) || b_exp == 32767)
+        return f3_zero(a_sgn ^ b_sgn);
+
+    a = f3_normalise(&a_exp, a);
+    b = f3_normalise(&b_exp, b);
+
+    // Result sign is the XOR of the operand signs; the exponents subtract,
+    // with a constant correction for the bias and quotient bit position.
+    x_sgn = a_sgn ^ b_sgn;
+    x_exp = a_exp - b_exp + 16395;
+
+    // Halve both 128-bit mantissas (presumably so that the left shift of the
+    // remainder inside the loop cannot lose its top bit).
+    a.x0 = a.x0 >> 1 | a.x1 << 63;
+    a.x1 = a.x1 >> 1;
+    b.x0 = b.x0 >> 1 | b.x1 << 63;
+    b.x1 = b.x1 >> 1;
+    x.x0 = 0;
+    x.x1 = 0;
+    // 116 iterations, each producing one quotient bit: shift the quotient
+    // left, subtract b from the running remainder a when a >= b (recording a
+    // 1 bit), then double the remainder.
+    for (i = 0; i < 116; i++) {
+        x.x1 = x.x1 << 1 | x.x0 >> 63;
+        x.x0 = x.x0 << 1;
+        if (a.x1 > b.x1 || (a.x1 == b.x1 && a.x0 >= b.x0)) {
+            // 128-bit subtraction; the borrow out of the low word is
+            // computed before a.x0 is overwritten.
+            a.x1 = a.x1 - b.x1 - (a.x0 < b.x0);
+            a.x0 = a.x0 - b.x0;
+            x.x0 |= 1;
+        }
+        a.x1 = a.x1 << 1 | a.x0 >> 63;
+        a.x0 = a.x0 << 1;
+    }
+    // Sticky bit: a non-zero remainder means the quotient is inexact.
+    x.x0 |= !!(a.x0 | a.x1);
+
+    x = f3_normalise(&x_exp, x);
+
+    return f3_round(x_sgn, x_exp, x);
+}
+
+// Widen an IEEE binary32 value to binary128. The conversion is exact.
+//
+// The result is assembled as raw bits in a u128_t (x1 = high word holding
+// sign, 15-bit exponent and the top 48 fraction bits; x0 = low word) and
+// copied into a long double.
+//
+// Cases, selected on the float's bit pattern:
+//   - +/-0:       only the sign bit is carried over.
+//   - Inf / NaN:  exponent becomes 0x7fff; NaNs are made quiet (bit 47 set).
+//   - subnormal:  renormalised, because binary128 can represent every
+//                 binary32 subnormal as a normal number.
+//   - normal:     exponent is rebiased by 16383 - 127 = 16256.
+long double __extendsftf2(float f)
+{
+    long double fx;
+    u128_t x;
+    uint32_t a;
+    uint64_t aa;
+    memcpy(&a, &f, 4);
+    aa = a;
+    x.x0 = 0;
+    if (!(a << 1))
+        x.x1 = aa << 32;
+    else if (a << 1 >> 24 == 255)
+        x.x1 = (0x7fff000000000000 | aa >> 31 << 63 | aa << 41 >> 16 |
+                (uint64_t)!!(a << 9) << 47);
+    else if (a << 1 >> 24 == 0) {
+        // Subnormal input: there is no implicit leading 1, so treating it
+        // like a normal number would produce the wrong value. Find how far
+        // the leading set fraction bit is below the implicit-bit position
+        // (adj ends up in 1..23), shift the fraction up by that amount so
+        // the leading 1 falls off as the implicit bit, and lower the
+        // exponent to match: 2^(-126 - adj) => biased 16257 - adj.
+        int adj = 0;
+        while (!(a << 1 >> 1 >> (23 - adj)))
+            adj++;
+        x.x1 = (aa >> 31 << 63 | (uint64_t)(16257 - adj) << 48 |
+                aa << adj << 41 >> 16);
+    }
+    else
+        x.x1 = (aa >> 31 << 63 | ((aa >> 23 & 255) + 16256) << 48 |
+                aa << 41 >> 16);
+    memcpy(&fx, &x, 16);
+    return fx;
+}
+
+// Widen an IEEE binary64 value to binary128. The conversion is exact.
+//
+// The result is assembled as raw bits in a u128_t (x1 = high word holding
+// sign, 15-bit exponent and the top 48 fraction bits; x0 = low word, whose
+// top 4 bits receive the remaining 4 fraction bits of the double) and
+// copied into a long double.
+//
+// Cases, selected on the double's bit pattern:
+//   - +/-0:       the bit pattern's sign is carried over unchanged.
+//   - Inf / NaN:  exponent becomes 0x7fff; NaNs are made quiet (bit 47 set).
+//   - subnormal:  renormalised, because binary128 can represent every
+//                 binary64 subnormal as a normal number.
+//   - normal:     exponent is rebiased by 16383 - 1023 = 15360.
+long double __extenddftf2(double f)
+{
+    long double fx;
+    u128_t x;
+    uint64_t a;
+    memcpy(&a, &f, 8);
+    x.x0 = a << 60;
+    if (!(a << 1))
+        x.x1 = a;
+    else if (a << 1 >> 53 == 2047)
+        x.x1 = (0x7fff000000000000 | a >> 63 << 63 | a << 12 >> 16 |
+                (uint64_t)!!(a << 12) << 47);
+    else if (a << 1 >> 53 == 0) {
+        // Subnormal input: there is no implicit leading 1, so treating it
+        // like a normal number would produce the wrong value. Find how far
+        // the leading set fraction bit is below the implicit-bit position
+        // (adj ends up in 1..52), shift the whole fraction -- including the
+        // 4 bits already placed in x0 -- up by that amount, and lower the
+        // exponent to match: 2^(-1022 - adj) => biased 15361 - adj.
+        int adj = 0;
+        while (!(a << 1 >> 1 >> (52 - adj)))
+            adj++;
+        x.x0 <<= adj;
+        x.x1 = (a >> 63 << 63 | (uint64_t)(15361 - adj) << 48 |
+                a << adj << 12 >> 16);
+    }
+    else
+        x.x1 = a >> 63 << 63 | ((a >> 52 & 2047) + 15360) << 48 | a << 12 >> 16;
+    memcpy(&fx, &x, 16);
+    return fx;
+}
+
+// Narrow a quad-precision value to a float, building the 32-bit result
+// pattern by hand and copying it into a float.
+float __trunctfsf2(long double f)
+{
+    u128_t mnt;
+    int32_t exp;
+    int sgn;
+    uint32_t x;
+    float fx;
+
+    f3_unpack(&sgn, &exp, &mnt, f);
+
+    // Maximum exponent with non-zero fraction bits: emit a NaN pattern with
+    // the quiet bit set, keeping the sign and the top fraction bits.
+    if (exp == 32767 && (mnt.x0 | mnt.x1 << 16))
+        x = 0x7fc00000 | (uint32_t)sgn << 31 | (mnt.x1 >> 25 & 0x007fffff);
+    // Exponent above the threshold: signed infinity pattern.
+    else if (exp > 16510)
+        x = 0x7f800000 | (uint32_t)sgn << 31;
+    // Exponent below the threshold: signed zero pattern.
+    else if (exp < 16233)
+        x = (uint32_t)sgn << 31;
+    else {
+        exp -= 16257;
+        // Keep two extra low-order bits for rounding (dropped again by the
+        // final x >> 2); the lowest one also collects every discarded bit.
+        x = mnt.x1 >> 23 | !!(mnt.x0 | mnt.x1 << 41);
+        if (exp < 0) {
+            // Shift right by the exponent deficit, again folding the bits
+            // shifted out into the lowest bit.
+            x = x >> -exp | !!(x << (32 + exp));
+            exp = 0;
+        }
+        // Round to nearest, ties to even: round up when the two extra bits
+        // are 11, or when they are exactly 10 and the kept low bit is 1.
+        if ((x & 3) == 3 || (x & 7) == 6)
+            x += 4;
+        // The addition lets a carry out of the mantissa bump the exponent.
+        x = ((x >> 2) + (exp << 23)) | (uint32_t)sgn << 31;
+    }
+    memcpy(&fx, &x, 4);
+    return fx;
+}
+
+// Narrow a quad-precision value to a double, building the 64-bit result
+// pattern by hand and copying it into a double.
+double __trunctfdf2(long double f)
+{
+    u128_t mnt;
+    int32_t exp;
+    int sgn;
+    uint64_t x;
+    double fx;
+
+    f3_unpack(&sgn, &exp, &mnt, f);
+
+    // Maximum exponent with non-zero fraction bits: emit a NaN pattern with
+    // the quiet bit set, keeping the sign and the top fraction bits.
+    if (exp == 32767 && (mnt.x0 | mnt.x1 << 16))
+        x = (0x7ff8000000000000 | (uint64_t)sgn << 63 |
+             mnt.x1 << 16 >> 12 | mnt.x0 >> 60);
+    // Exponent above the threshold: signed infinity pattern.
+    else if (exp > 17406)
+        x = 0x7ff0000000000000 | (uint64_t)sgn << 63;
+    // Exponent below the threshold: signed zero pattern.
+    else if (exp < 15308)
+        x = (uint64_t)sgn << 63;
+    else {
+        exp -= 15361;
+        // Keep two extra low-order bits for rounding (dropped again by the
+        // final x >> 2); the lowest one also collects every discarded bit.
+        x = mnt.x1 << 6 | mnt.x0 >> 58 | !!(mnt.x0 << 6);
+        if (exp < 0) {
+            // Shift right by the exponent deficit, again folding the bits
+            // shifted out into the lowest bit.
+            x = x >> -exp | !!(x << (64 + exp));
+            exp = 0;
+        }
+        // Round to nearest, ties to even: round up when the two extra bits
+        // are 11, or when they are exactly 10 and the kept low bit is 1.
+        if ((x & 3) == 3 || (x & 7) == 6)
+            x += 4;
+        // The addition lets a carry out of the mantissa bump the exponent.
+        x = ((x >> 2) + ((uint64_t)exp << 52)) | (uint64_t)sgn << 63;
+    }
+    memcpy(&fx, &x, 8);
+    return fx;
+}
+
+// Convert a quad-precision value to int32_t.
+// Returns 0 when the unpacked exponent is below 16369, saturates to the
+// most negative / most positive int32_t (chosen by sign) when it is above
+// 16413, and otherwise shifts the high mantissa word down and applies the
+// sign.
+int32_t __fixtfsi(long double fa)
+{
+    u128_t mant;
+    int32_t exponent;
+    int negative;
+    int32_t magnitude;
+
+    f3_unpack(&negative, &exponent, &mant, fa);
+
+    if (exponent < 16369)
+        return 0;
+    if (exponent > 16413) {
+        if (negative)
+            return -0x80000000;
+        return 0x7fffffff;
+    }
+
+    magnitude = mant.x1 >> (16431 - exponent);
+    if (negative)
+        return -magnitude;
+    return magnitude;
+}
+
+// Convert a quad-precision value to int64_t.
+// Returns 0 when the unpacked exponent is below 16383, saturates to the
+// most negative / most positive int64_t (chosen by sign) when it is above
+// 16445, and otherwise shifts the top 64 mantissa bits down and applies the
+// sign.
+int64_t __fixtfdi(long double fa)
+{
+    u128_t mant;
+    int32_t exponent;
+    int negative;
+    int64_t magnitude;
+
+    f3_unpack(&negative, &exponent, &mant, fa);
+
+    if (exponent < 16383)
+        return 0;
+    if (exponent > 16445) {
+        if (negative)
+            return -0x8000000000000000;
+        return 0x7fffffffffffffff;
+    }
+
+    magnitude = (mant.x1 << 15 | mant.x0 >> 49) >> (16446 - exponent);
+    if (negative)
+        return -magnitude;
+    return magnitude;
+}
+
+// Convert a quad-precision value to uint32_t.
+// Negative inputs and inputs whose unpacked exponent is below 16369 give 0;
+// an exponent above 16414 gives all ones; otherwise the high mantissa word
+// is shifted down into place.
+uint32_t __fixunstfsi(long double fa)
+{
+    u128_t mant;
+    int32_t exponent;
+    int negative;
+
+    f3_unpack(&negative, &exponent, &mant, fa);
+
+    if (negative)
+        return 0;
+    if (exponent < 16369)
+        return 0;
+    if (exponent > 16414)
+        return ~(uint32_t)0;
+    return (uint32_t)(mant.x1 >> (16431 - exponent));
+}
+
+// Convert a quad-precision value to uint64_t.
+// Negative inputs and inputs whose unpacked exponent is below 16383 give 0;
+// an exponent above 16446 gives all ones; otherwise the top 64 mantissa
+// bits are shifted down into place.
+uint64_t __fixunstfdi(long double fa)
+{
+    u128_t mant;
+    int32_t exponent;
+    int negative;
+    uint64_t top_bits;
+
+    f3_unpack(&negative, &exponent, &mant, fa);
+
+    if (negative)
+        return 0;
+    if (exponent < 16383)
+        return 0;
+    if (exponent > 16446)
+        return ~(uint64_t)0;
+
+    top_bits = mant.x1 << 15 | mant.x0 >> 49;
+    return top_bits >> (16446 - exponent);
+}
+
+// Convert an int32_t to quad precision.
+// Zero maps to an all-zero bit pattern. Otherwise the magnitude is shifted
+// left until its top bit is set (adjusting the exponent to match), the top
+// bit is dropped as the implicit leading 1, and sign, exponent and fraction
+// are packed into the high word.
+long double __floatsitf(int32_t a)
+{
+    u128_t bits = { 0, 0 };
+    long double result;
+
+    if (a) {
+        int negative = a < 0;
+        int biased = 16414;
+        uint32_t mag = a;
+        int step = 16;
+
+        if (negative)
+            mag = -mag;
+
+        // Try shifts of 16, 8, 4, 2, 1; take each one whose top 'step' bits
+        // are all clear.
+        while (step) {
+            if (!(mag >> (32 - step))) {
+                mag <<= step;
+                biased -= step;
+            }
+            step >>= 1;
+        }
+
+        bits.x1 = (uint64_t)negative << 63;
+        bits.x1 |= (uint64_t)biased << 48;
+        bits.x1 |= (uint64_t)(mag << 1) << 16;
+    }
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+// Convert an int64_t to quad precision.
+// Zero maps to an all-zero bit pattern. Otherwise the magnitude is shifted
+// left until its top bit is set (adjusting the exponent to match), the top
+// bit is dropped as the implicit leading 1, and the remaining 63 bits are
+// split across the high word (48 bits) and the top of the low word (15).
+long double __floatditf(int64_t a)
+{
+    u128_t bits = { 0, 0 };
+    long double result;
+
+    if (a) {
+        int negative = a < 0;
+        int biased = 16446;
+        uint64_t mag = a;
+        int step = 32;
+
+        if (negative)
+            mag = -mag;
+
+        // Try shifts of 32, 16, 8, 4, 2, 1; take each one whose top 'step'
+        // bits are all clear.
+        while (step) {
+            if (!(mag >> (64 - step))) {
+                mag <<= step;
+                biased -= step;
+            }
+            step >>= 1;
+        }
+
+        bits.x0 = mag << 49;
+        bits.x1 = (uint64_t)negative << 63;
+        bits.x1 |= (uint64_t)biased << 48;
+        bits.x1 |= mag << 1 >> 16;
+    }
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+// Convert a uint32_t to quad precision.
+// Zero maps to an all-zero bit pattern. Otherwise the value is shifted left
+// until its top bit is set (adjusting the exponent to match), the top bit
+// is dropped as the implicit leading 1, and exponent and fraction are
+// packed into the high word.
+long double __floatunsitf(uint32_t a)
+{
+    u128_t bits = { 0, 0 };
+    long double result;
+
+    if (a) {
+        int biased = 16414;
+        uint32_t mag = a;
+        int step = 16;
+
+        // Try shifts of 16, 8, 4, 2, 1; take each one whose top 'step' bits
+        // are all clear.
+        while (step) {
+            if (!(mag >> (32 - step))) {
+                mag <<= step;
+                biased -= step;
+            }
+            step >>= 1;
+        }
+
+        bits.x1 = (uint64_t)biased << 48;
+        bits.x1 |= (uint64_t)(mag << 1) << 16;
+    }
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+// Convert a uint64_t to quad precision.
+// Zero maps to an all-zero bit pattern. Otherwise the value is shifted left
+// until its top bit is set (adjusting the exponent to match), the top bit
+// is dropped as the implicit leading 1, and the remaining 63 bits are split
+// across the high word (48 bits) and the top of the low word (15).
+long double __floatunditf(uint64_t a)
+{
+    u128_t bits = { 0, 0 };
+    long double result;
+
+    if (a) {
+        int biased = 16446;
+        uint64_t mag = a;
+        int step = 32;
+
+        // Try shifts of 32, 16, 8, 4, 2, 1; take each one whose top 'step'
+        // bits are all clear.
+        while (step) {
+            if (!(mag >> (64 - step))) {
+                mag <<= step;
+                biased -= step;
+            }
+            step >>= 1;
+        }
+
+        bits.x0 = mag << 49;
+        bits.x1 = (uint64_t)biased << 48;
+        bits.x1 |= mag << 1 >> 16;
+    }
+    memcpy(&result, &bits, 16);
+    return result;
+}
+
+// Three-way comparison of two quad-precision values on their raw bits.
+// Returns:
+//    0  if both are zero (of either sign) or the values are bit-identical,
+//    2  if either operand is a NaN (maximum exponent, non-zero fraction),
+//   -1  if fa < fb,
+//    1  if fa > fb.
+static int f3_cmp(long double fa, long double fb)
+{
+    u128_t a, b;
+    int a_neg, b_neg, when_smaller;
+
+    memcpy(&a, &fa, 16);
+    memcpy(&b, &fb, 16);
+
+    // Everything except the two sign bits is zero: +0 and -0 compare equal.
+    if (!(a.x0 | a.x1 << 1 | b.x0 | b.x1 << 1))
+        return 0;
+
+    // Unordered.
+    if ((a.x1 << 1 >> 49 == 0x7fff && (a.x0 | a.x1 << 16)) ||
+        (b.x1 << 1 >> 49 == 0x7fff && (b.x0 | b.x1 << 16)))
+        return 2;
+
+    a_neg = (int)(a.x1 >> 63);
+    b_neg = (int)(b.x1 >> 63);
+    if (a_neg != b_neg)
+        return b_neg - a_neg;
+
+    // Same sign: compare the bit patterns as unsigned integers. A smaller
+    // pattern means a smaller value for positives and a larger value for
+    // negatives.
+    when_smaller = (int)(a.x1 >> 63 << 1) - 1;
+    if (a.x1 < b.x1)
+        return when_smaller;
+    if (a.x1 > b.x1)
+        return -when_smaller;
+    if (a.x0 < b.x0)
+        return when_smaller;
+    if (b.x0 < a.x0)
+        return -when_smaller;
+    return 0;
+}
+
+// Equality helper: returns 0 when f3_cmp reports the operands equal and 1
+// for every other outcome (including the unordered result).
+int __eqtf2(long double a, long double b)
+{
+    int order = f3_cmp(a, b);
+
+    return order != 0;
+}
+
+// Inequality helper: returns 1 when f3_cmp reports anything other than
+// equal (including the unordered result) and 0 when the operands are equal.
+int __netf2(long double a, long double b)
+{
+    int order = f3_cmp(a, b);
+
+    return order != 0;
+}
+
+// Less-than helper: passes through f3_cmp(a, b), so the result is negative
+// exactly when a < b and positive (2) when the operands are unordered.
+int __lttf2(long double a, long double b)
+{
+    int order = f3_cmp(a, b);
+
+    return order;
+}
+
+// Less-or-equal helper: passes through f3_cmp(a, b), so the result is <= 0
+// exactly when a <= b and positive (2) when the operands are unordered.
+int __letf2(long double a, long double b)
+{
+    int order = f3_cmp(a, b);
+
+    return order;
+}
+
+// Greater-than helper: negates f3_cmp with the operands swapped, so the
+// result is positive exactly when a > b and negative (-2) when unordered.
+int __gttf2(long double a, long double b)
+{
+    int swapped_order = f3_cmp(b, a);
+
+    return -swapped_order;
+}
+
+// Greater-or-equal helper: negates f3_cmp with the operands swapped, so the
+// result is >= 0 exactly when a >= b and negative (-2) when unordered.
+int __getf2(long double a, long double b)
+{
+    int swapped_order = f3_cmp(b, a);
+
+    return -swapped_order;
+}
diff --git a/tinyc/lib/libtcc1.c b/tinyc/lib/libtcc1.c
index b079477e4..0e466180c 100644
--- a/tinyc/lib/libtcc1.c
+++ b/tinyc/lib/libtcc1.c
@@ -103,14 +103,14 @@ union double_long {
 
 union float_long {
     float f;
-    long l;
+    unsigned int l;
 };
 
 /* XXX: we don't support several builtin supports for now */
-#ifndef __x86_64__
+#if !defined __x86_64__ && !defined __arm__
 
 /* XXX: use gcc/tcc intrinsic ? */
-#if defined(__i386__)
+#if defined __i386__
 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   __asm__ ("subl %5,%1\n\tsbbl %3,%0"					\
 	   : "=r" ((USItype) (sh)),					\
@@ -162,7 +162,7 @@ static UDWtype __udivmoddi4 (UDWtype n, UDWtype d, UDWtype *rp)
   n0 = nn.s.low;
   n1 = nn.s.high;
 
-#if !UDIV_NEEDS_NORMALIZATION
+#if !defined(UDIV_NEEDS_NORMALIZATION)
   if (d1 == 0)
     {
       if (d0 > n1)
@@ -478,13 +478,6 @@ long long __ashldi3(long long a, int b)
 #endif
 }
 
-#if defined(__i386__)
-/* FPU control word for rounding to nearest mode */
-unsigned short __tcc_fpu_control = 0x137f;
-/* FPU control word for round to zero mode for int conversion */
-unsigned short __tcc_int_fpu_control = 0x137f | 0x0c00;
-#endif
-
 #endif /* !__x86_64__ */
 
 /* XXX: fix tcc's code generator to do this instead */
@@ -557,6 +550,13 @@ unsigned long long __fixunssfdi (float a1)
         return 0;
 }
 
+/* float -> signed 64-bit: convert the magnitude with __fixunssfdi, then
+   negate the result if the input did not compare >= 0. */
+long long __fixsfdi (float a1)
+{
+    int nonneg = a1 >= 0;
+    long long magnitude;
+
+    if (nonneg)
+        magnitude = __fixunssfdi(a1);
+    else
+        magnitude = __fixunssfdi(-a1);
+
+    if (nonneg)
+        return magnitude;
+    return -magnitude;
+}
+
 unsigned long long __fixunsdfdi (double a1)
 {
     register union double_long dl1;
@@ -582,6 +582,14 @@ unsigned long long __fixunsdfdi (double a1)
         return 0;
 }
 
+/* double -> signed 64-bit: convert the magnitude with __fixunsdfdi, then
+   negate the result if the input did not compare >= 0. */
+long long __fixdfdi (double a1)
+{
+    int nonneg = a1 >= 0;
+    long long magnitude;
+
+    if (nonneg)
+        magnitude = __fixunsdfdi(a1);
+    else
+        magnitude = __fixunsdfdi(-a1);
+
+    if (nonneg)
+        return magnitude;
+    return -magnitude;
+}
+
+#ifndef __arm__
 unsigned long long __fixunsxfdi (long double a1)
 {
     register union ldouble_long dl1;
@@ -605,3 +613,10 @@ unsigned long long __fixunsxfdi (long double a1)
         return 0;
 }
 
+/* long double -> signed 64-bit: convert the magnitude with __fixunsxfdi,
+   then negate the result if the input did not compare >= 0. */
+long long __fixxfdi (long double a1)
+{
+    int nonneg = a1 >= 0;
+    long long magnitude;
+
+    if (nonneg)
+        magnitude = __fixunsxfdi(a1);
+    else
+        magnitude = __fixunsxfdi(-a1);
+
+    if (nonneg)
+        return magnitude;
+    return -magnitude;
+}
+#endif /* !ARM */
diff --git a/tinyc/lib/va_list.c b/tinyc/lib/va_list.c
new file mode 100644
index 000000000..8749f46f8
--- /dev/null
+++ b/tinyc/lib/va_list.c
@@ -0,0 +1,65 @@
+/* va_list.c - tinycc support for va_list on X86_64 */
+
+#if defined __x86_64__
+
+/* Avoid include files, they may not be available when cross compiling */
+extern void *memset(void *s, int c, __SIZE_TYPE__ n);
+extern void abort(void);
+
+/* This should be in sync with our include/stdarg.h */
+enum __va_arg_type {
+    /* Selector passed to __va_arg: fetch from the general-purpose register
+       slots, from the floating-point register slots, or directly from the
+       stack overflow area. */
+    __va_gen_reg, __va_float_reg, __va_stack
+};
+
+/* GCC compatible definition of va_list. */
+typedef struct {
+    /* Byte offset into reg_save_area of the next general-purpose slot;
+       __va_arg uses register slots only while this stays within 48. */
+    unsigned int gp_offset;
+    /* Byte offset into reg_save_area of the next floating-point slot;
+       __va_arg uses register slots only while this is below 48 + 128. */
+    unsigned int fp_offset;
+    /* __va_start reads overflow_offset from the saved copy and replaces it
+       with the pointer fp + overflow_offset; afterwards only the pointer
+       member is used. */
+    union {
+        unsigned int overflow_offset;
+        char *overflow_arg_area;
+    };
+    /* Base of the register save area; set by __va_start. */
+    char *reg_save_area;
+} __va_list_struct;
+
+/* Initialise *ap from the frame pointer fp: copy the va_list image stored
+   16 bytes below fp, turn its overflow offset into a pointer relative to
+   fp, and point reg_save_area 176 + 16 bytes below fp. */
+void __va_start(__va_list_struct *ap, void *fp)
+{
+    char *frame = (char *)fp;
+    __va_list_struct *saved = (__va_list_struct *)(frame - 16);
+
+    memset(ap, 0, sizeof(__va_list_struct));
+    *ap = *saved;
+    ap->overflow_arg_area = frame + ap->overflow_offset;
+    ap->reg_save_area = frame - 176 - 16;
+}
+
+/* Return the address of the next variadic argument and advance *ap.
+   size and align are both rounded up to a multiple of 8 first.
+   - __va_gen_reg:   taken from the register save area while gp_offset +
+                     size stays within 48 bytes, otherwise from the stack.
+   - __va_float_reg: taken from the register save area (16 bytes per slot)
+                     while fp_offset is below 48 + 128, otherwise 8 bytes
+                     are taken from the stack.
+   - __va_stack:     always taken from the stack overflow area.
+   Any other arg_type calls abort(). */
+void *__va_arg(__va_list_struct *ap,
+               enum __va_arg_type arg_type,
+               int size, int align)
+{
+    char *slot;
+    char *next;
+
+    size = (size + 7) & ~7;
+    align = (align + 7) & ~7;
+
+    if (arg_type == __va_gen_reg) {
+        if (ap->gp_offset + size <= 48) {
+            slot = ap->reg_save_area + ap->gp_offset;
+            ap->gp_offset += size;
+            return slot;
+        }
+    } else if (arg_type == __va_float_reg) {
+        if (ap->fp_offset < 128 + 48) {
+            slot = ap->reg_save_area + ap->fp_offset;
+            ap->fp_offset += 16;
+            return slot;
+        }
+        size = 8;
+    } else if (arg_type != __va_stack) {
+        /* should never happen */
+        abort();
+    }
+
+    /* Stack overflow area: advance past the argument, round the new end up
+       to the requested alignment, and hand back the 'size' bytes just
+       below it. */
+    next = ap->overflow_arg_area + size;
+    next = (char*)((long long)(next + align - 1) & -align);
+    ap->overflow_arg_area = next;
+    return next - size;
+}
+#endif