summary refs log tree commit diff stats
path: root/tinyc/i386-asm.c
diff options
context:
space:
mode:
author: Dmitry Atamanov <data-man@users.noreply.github.com> 2017-10-28 10:25:56 +0300
committer: Andreas Rumpf <rumpf_a@web.de> 2017-10-28 09:25:56 +0200
commit: d2c7d391c8b69a6a590a2f702ed58bea033f6325 (patch)
tree: c74a1b46e1166ddb87453ddc49cea84e1baaa5ab /tinyc/i386-asm.c
parent: 9c00f6decd4453a4233450a60ccef05b20e9f24a (diff)
download: Nim-d2c7d391c8b69a6a590a2f702ed58bea033f6325.tar.gz
TinyC upgrade (#6593)
Diffstat (limited to 'tinyc/i386-asm.c')
-rw-r--r--  tinyc/i386-asm.c | 1145
1 files changed, 824 insertions, 321 deletions
diff --git a/tinyc/i386-asm.c b/tinyc/i386-asm.c
index 12ff8f2ba..2e184974e 100644
--- a/tinyc/i386-asm.c
+++ b/tinyc/i386-asm.c
@@ -1,7 +1,8 @@
 /*
  *  i386 specific functions for TCC assembler
- * 
+ *
  *  Copyright (c) 2001, 2002 Fabrice Bellard
+ *  Copyright (c) 2009 Frédéric Feret (x86_64 support)
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -18,66 +19,88 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include "tcc.h"
+
 #define MAX_OPERANDS 3
 
-typedef struct ASMInstr {
-    uint16_t sym;
-    uint16_t opcode;
-    uint16_t instr_type;
-#define OPC_JMP       0x01  /* jmp operand */
-#define OPC_B         0x02  /* only used zith OPC_WL */
-#define OPC_WL        0x04  /* accepts w, l or no suffix */
-#define OPC_BWL       (OPC_B | OPC_WL) /* accepts b, w, l or no suffix */
-#define OPC_REG       0x08 /* register is added to opcode */
-#define OPC_MODRM     0x10 /* modrm encoding */
-#define OPC_FWAIT     0x20 /* add fwait opcode */
-#define OPC_TEST      0x40 /* test opcodes */
-#define OPC_SHIFT     0x80 /* shift opcodes */
-#define OPC_D16      0x0100 /* generate data16 prefix */
-#define OPC_ARITH    0x0200 /* arithmetic opcodes */
-#define OPC_SHORTJMP 0x0400 /* short jmp operand */
-#define OPC_FARITH   0x0800 /* FPU arithmetic opcodes */
+#define TOK_ASM_first TOK_ASM_clc
+#define TOK_ASM_last TOK_ASM_emms
+#define TOK_ASM_alllast TOK_ASM_subps
+
+#define OPC_B          0x01  /* only used with OPC_WL */
+#define OPC_WL         0x02  /* accepts w, l or no suffix */
+#define OPC_BWL        (OPC_B | OPC_WL) /* accepts b, w, l or no suffix */
+#define OPC_REG        0x04 /* register is added to opcode */
+#define OPC_MODRM      0x08 /* modrm encoding */
+
+#define OPCT_MASK      0x70
+#define OPC_FWAIT      0x10 /* add fwait opcode */
+#define OPC_SHIFT      0x20 /* shift opcodes */
+#define OPC_ARITH      0x30 /* arithmetic opcodes */
+#define OPC_FARITH     0x40 /* FPU arithmetic opcodes */
+#define OPC_TEST       0x50 /* test opcodes */
+#define OPCT_IS(v,i) (((v) & OPCT_MASK) == (i))
+
+#define OPC_0F        0x100 /* Is secondary map (0x0f prefix) */
+#define OPC_48        0x200 /* Always has REX prefix */
+#ifdef TCC_TARGET_X86_64
+# define OPC_WLQ     0x1000  /* accepts w, l, q or no suffix */
+# define OPC_BWLQ    (OPC_B | OPC_WLQ) /* accepts b, w, l, q or no suffix */
+# define OPC_WLX     OPC_WLQ
+# define OPC_BWLX    OPC_BWLQ
+#else
+# define OPC_WLX     OPC_WL
+# define OPC_BWLX    OPC_BWL
+#endif
+
 #define OPC_GROUP_SHIFT 13
 
 /* in order to compress the operand type, we use specific operands and
-   we or only with EA  */ 
-#define OPT_REG8  0 /* warning: value is hardcoded from TOK_ASM_xxx */
-#define OPT_REG16 1 /* warning: value is hardcoded from TOK_ASM_xxx */
-#define OPT_REG32 2 /* warning: value is hardcoded from TOK_ASM_xxx */
-#define OPT_MMX   3 /* warning: value is hardcoded from TOK_ASM_xxx */
-#define OPT_SSE   4 /* warning: value is hardcoded from TOK_ASM_xxx */
-#define OPT_CR    5 /* warning: value is hardcoded from TOK_ASM_xxx */
-#define OPT_TR    6 /* warning: value is hardcoded from TOK_ASM_xxx */
-#define OPT_DB    7 /* warning: value is hardcoded from TOK_ASM_xxx */
-#define OPT_SEG   8
-#define OPT_ST    9
-#define OPT_IM8   10
-#define OPT_IM8S  11
-#define OPT_IM16  12
-#define OPT_IM32  13
-#define OPT_EAX   14 /* %al, %ax or %eax register */
-#define OPT_ST0   15 /* %st(0) register */
-#define OPT_CL    16 /* %cl register */
-#define OPT_DX    17 /* %dx register */
-#define OPT_ADDR  18 /* OP_EA with only offset */
-#define OPT_INDIR 19 /* *(expr) */
-
-/* composite types */ 
-#define OPT_COMPOSITE_FIRST   20
-#define OPT_IM       20 /* IM8 | IM16 | IM32 */
-#define OPT_REG      21 /* REG8 | REG16 | REG32 */ 
-#define OPT_REGW     22 /* REG16 | REG32 */
-#define OPT_IMW      23 /* IM16 | IM32 */ 
-
-/* can be ored with any OPT_xxx */
-#define OPT_EA    0x80
-
-    uint8_t nb_ops;
-    uint8_t op_type[MAX_OPERANDS]; /* see OP_xxx */
-} ASMInstr;
+   we or only with EA  */
+enum {
+    OPT_REG8=0, /* warning: value is hardcoded from TOK_ASM_xxx */
+    OPT_REG16,  /* warning: value is hardcoded from TOK_ASM_xxx */
+    OPT_REG32,  /* warning: value is hardcoded from TOK_ASM_xxx */
+#ifdef TCC_TARGET_X86_64
+    OPT_REG64,  /* warning: value is hardcoded from TOK_ASM_xxx */
+#endif
+    OPT_MMX,    /* warning: value is hardcoded from TOK_ASM_xxx */
+    OPT_SSE,    /* warning: value is hardcoded from TOK_ASM_xxx */
+    OPT_CR,     /* warning: value is hardcoded from TOK_ASM_xxx */
+    OPT_TR,     /* warning: value is hardcoded from TOK_ASM_xxx */
+    OPT_DB,     /* warning: value is hardcoded from TOK_ASM_xxx */
+    OPT_SEG,
+    OPT_ST,
+#ifdef TCC_TARGET_X86_64
+    OPT_REG8_LOW, /* %spl,%bpl,%sil,%dil, encoded like ah,ch,dh,bh, but
+		     with REX prefix, not used in insn templates */
+#endif
+    OPT_IM8,
+    OPT_IM8S,
+    OPT_IM16,
+    OPT_IM32,
+#ifdef TCC_TARGET_X86_64
+    OPT_IM64,
+#endif
+    OPT_EAX,    /* %al, %ax, %eax or %rax register */
+    OPT_ST0,    /* %st(0) register */
+    OPT_CL,     /* %cl register */
+    OPT_DX,     /* %dx register */
+    OPT_ADDR,   /* OP_EA with only offset */
+    OPT_INDIR,  /* *(expr) */
+    /* composite types */
+    OPT_COMPOSITE_FIRST,
+    OPT_IM,     /* IM8 | IM16 | IM32 */
+    OPT_REG,    /* REG8 | REG16 | REG32 | REG64 */
+    OPT_REGW,   /* REG16 | REG32 | REG64 */
+    OPT_IMW,    /* IM16 | IM32 */
+    OPT_MMXSSE, /* MMX | SSE */
+    OPT_DISP,   /* Like OPT_ADDR, but emitted as displacement (for jumps) */
+    OPT_DISP8,  /* Like OPT_ADDR, but only 8bit (short jumps) */
+    /* can be ored with any OPT_xxx */
+    OPT_EA = 0x80
+};
 
-typedef struct Operand {
-    uint32_t type;
 #define OP_REG8   (1 << OPT_REG8)
 #define OP_REG16  (1 << OPT_REG16)
 #define OP_REG32  (1 << OPT_REG32)
@@ -98,26 +121,58 @@ typedef struct Operand {
 #define OP_DX     (1 << OPT_DX)
 #define OP_ADDR   (1 << OPT_ADDR)
 #define OP_INDIR  (1 << OPT_INDIR)
+#ifdef TCC_TARGET_X86_64
+# define OP_REG64 (1 << OPT_REG64)
+# define OP_REG8_LOW (1 << OPT_REG8_LOW)
+# define OP_IM64  (1 << OPT_IM64)
+# define OP_EA32  (OP_EA << 1)
+#else
+# define OP_REG64 0
+# define OP_REG8_LOW 0
+# define OP_IM64  0
+# define OP_EA32  0
+#endif
 
 #define OP_EA     0x40000000
-#define OP_REG    (OP_REG8 | OP_REG16 | OP_REG32)
-#define OP_IM     OP_IM32
+#define OP_REG    (OP_REG8 | OP_REG16 | OP_REG32 | OP_REG64)
+
+#ifdef TCC_TARGET_X86_64
+# define TREG_XAX   TREG_RAX
+# define TREG_XCX   TREG_RCX
+# define TREG_XDX   TREG_RDX
+#else
+# define TREG_XAX   TREG_EAX
+# define TREG_XCX   TREG_ECX
+# define TREG_XDX   TREG_EDX
+#endif
+
+typedef struct ASMInstr {
+    uint16_t sym;
+    uint16_t opcode;
+    uint16_t instr_type;
+    uint8_t nb_ops;
+    uint8_t op_type[MAX_OPERANDS]; /* see OP_xxx */
+} ASMInstr;
+
+typedef struct Operand {
+    uint32_t type;
     int8_t  reg; /* register, -1 if none */
     int8_t  reg2; /* second register, -1 if none */
     uint8_t shift;
     ExprValue e;
 } Operand;
 
-static const uint8_t reg_to_size[5] = {
+static const uint8_t reg_to_size[9] = {
 /*
     [OP_REG8] = 0,
     [OP_REG16] = 1,
     [OP_REG32] = 2,
+#ifdef TCC_TARGET_X86_64
+    [OP_REG64] = 3,
+#endif
 */
-    0, 0, 1, 0, 2
+    0, 0, 1, 0, 2, 0, 0, 0, 3
 };
-    
-#define WORD_PREFIX_OPCODE 0x66
 
 #define NB_TEST_OPCODES 30
 
@@ -165,13 +220,20 @@ static const uint8_t segment_prefixes[] = {
 
 static const ASMInstr asm_instrs[] = {
 #define ALT(x) x
+/* This removes a 0x0f in the second byte */
+#define O(o) ((uint64_t) ((((o) & 0xff00) == 0x0f00) ? ((((o) >> 8) & ~0xff) | ((o) & 0xff)) : (o)))
+/* This constructs instr_type from opcode, type and group.  */
+#define T(o,i,g) ((i) | ((g) << OPC_GROUP_SHIFT) | ((((o) & 0xff00) == 0x0f00) ? OPC_0F : 0))
 #define DEF_ASM_OP0(name, opcode)
-#define DEF_ASM_OP0L(name, opcode, group, instr_type) { TOK_ASM_ ## name, opcode, (instr_type | group << OPC_GROUP_SHIFT), 0 },
-#define DEF_ASM_OP1(name, opcode, group, instr_type, op0) { TOK_ASM_ ## name, opcode, (instr_type | group << OPC_GROUP_SHIFT), 1, { op0 }},
-#define DEF_ASM_OP2(name, opcode, group, instr_type, op0, op1) { TOK_ASM_ ## name, opcode, (instr_type | group << OPC_GROUP_SHIFT), 2, { op0, op1 }},
-#define DEF_ASM_OP3(name, opcode, group, instr_type, op0, op1, op2) { TOK_ASM_ ## name, opcode, (instr_type | group << OPC_GROUP_SHIFT), 3, { op0, op1, op2 }},
-#include "i386-asm.h"
-
+#define DEF_ASM_OP0L(name, opcode, group, instr_type) { TOK_ASM_ ## name, O(opcode), T(opcode, instr_type, group), 0, { 0 } },
+#define DEF_ASM_OP1(name, opcode, group, instr_type, op0) { TOK_ASM_ ## name, O(opcode), T(opcode, instr_type, group), 1, { op0 }},
+#define DEF_ASM_OP2(name, opcode, group, instr_type, op0, op1) { TOK_ASM_ ## name, O(opcode), T(opcode, instr_type, group), 2, { op0, op1 }},
+#define DEF_ASM_OP3(name, opcode, group, instr_type, op0, op1, op2) { TOK_ASM_ ## name, O(opcode), T(opcode, instr_type, group), 3, { op0, op1, op2 }},
+#ifdef TCC_TARGET_X86_64
+# include "x86_64-asm.h"
+#else
+# include "i386-asm.h"
+#endif
     /* last operation */
     { 0, },
 };
@@ -183,13 +245,16 @@ static const uint16_t op0_codes[] = {
 #define DEF_ASM_OP1(name, opcode, group, instr_type, op0)
 #define DEF_ASM_OP2(name, opcode, group, instr_type, op0, op1)
 #define DEF_ASM_OP3(name, opcode, group, instr_type, op0, op1, op2)
-#include "i386-asm.h"
+#ifdef TCC_TARGET_X86_64
+# include "x86_64-asm.h"
+#else
+# include "i386-asm.h"
+#endif
 };
 
 static inline int get_reg_shift(TCCState *s1)
 {
     int shift, v;
-
     v = asm_int_expr(s1);
     switch(v) {
     case 1:
@@ -212,21 +277,73 @@ static inline int get_reg_shift(TCCState *s1)
     return shift;
 }
 
-static int asm_parse_reg(void)
+#ifdef TCC_TARGET_X86_64
+static int asm_parse_numeric_reg(int t, unsigned int *type)
 {
-    int reg;
+    int reg = -1;
+    if (t >= TOK_IDENT && t < tok_ident) {
+	const char *s = table_ident[t - TOK_IDENT]->str;
+	char c;
+	*type = OP_REG64;
+	if (*s == 'c') {
+	    s++;
+	    *type = OP_CR;
+	}
+	if (*s++ != 'r')
+	  return -1;
+	/* Don't allow leading '0'.  */
+	if ((c = *s++) >= '1' && c <= '9')
+	  reg = c - '0';
+	else
+	  return -1;
+	if ((c = *s) >= '0' && c <= '5')
+	  s++, reg = reg * 10 + c - '0';
+	if (reg > 15)
+	  return -1;
+	if ((c = *s) == 0)
+	  ;
+	else if (*type != OP_REG64)
+	  return -1;
+	else if (c == 'b' && !s[1])
+	  *type = OP_REG8;
+	else if (c == 'w' && !s[1])
+	  *type = OP_REG16;
+	else if (c == 'd' && !s[1])
+	  *type = OP_REG32;
+	else
+	  return -1;
+    }
+    return reg;
+}
+#endif
+
+static int asm_parse_reg(unsigned int *type)
+{
+    int reg = 0;
+    *type = 0;
     if (tok != '%')
         goto error_32;
     next();
     if (tok >= TOK_ASM_eax && tok <= TOK_ASM_edi) {
         reg = tok - TOK_ASM_eax;
-        next();
-        return reg;
+	*type = OP_REG32;
+#ifdef TCC_TARGET_X86_64
+    } else if (tok >= TOK_ASM_rax && tok <= TOK_ASM_rdi) {
+        reg = tok - TOK_ASM_rax;
+	*type = OP_REG64;
+    } else if (tok == TOK_ASM_rip) {
+        reg = -2; /* Probably should use different escape code. */
+	*type = OP_REG64;
+    } else if ((reg = asm_parse_numeric_reg(tok, type)) >= 0
+	       && (*type == OP_REG32 || *type == OP_REG64)) {
+	;
+#endif
     } else {
     error_32:
-        expect("32 bit register");
-        return 0;
+        expect("register");
     }
+    next();
+    return reg;
 }
 
 static void parse_operand(TCCState *s1, Operand *op)
@@ -247,11 +364,11 @@ static void parse_operand(TCCState *s1, Operand *op)
             reg = tok - TOK_ASM_al;
             op->type = 1 << (reg >> 3); /* WARNING: do not change constant order */
             op->reg = reg & 7;
-            if ((op->type & OP_REG) && op->reg == TREG_EAX)
+            if ((op->type & OP_REG) && op->reg == TREG_XAX)
                 op->type |= OP_EAX;
-            else if (op->type == OP_REG8 && op->reg == TREG_ECX)
+            else if (op->type == OP_REG8 && op->reg == TREG_XCX)
                 op->type |= OP_CL;
-            else if (op->type == OP_REG16 && op->reg == TREG_EDX)
+            else if (op->type == OP_REG16 && op->reg == TREG_XDX)
                 op->type |= OP_DX;
         } else if (tok >= TOK_ASM_dr0 && tok <= TOK_ASM_dr7) {
             op->type = OP_DB;
@@ -267,7 +384,7 @@ static void parse_operand(TCCState *s1, Operand *op)
                 next();
                 if (tok != TOK_PPNUM)
                     goto reg_error;
-                p = tokc.cstr->data;
+                p = tokc.str.data;
                 reg = p[0] - '0';
                 if ((unsigned)reg >= 8 || p[1] != '\0')
                     goto reg_error;
@@ -278,9 +395,16 @@ static void parse_operand(TCCState *s1, Operand *op)
             if (op->reg == 0)
                 op->type |= OP_ST0;
             goto no_skip;
+#ifdef TCC_TARGET_X86_64
+	} else if (tok >= TOK_ASM_spl && tok <= TOK_ASM_dil) {
+	    op->type = OP_REG8 | OP_REG8_LOW;
+	    op->reg = 4 + tok - TOK_ASM_spl;
+        } else if ((op->reg = asm_parse_numeric_reg(tok, &op->type)) >= 0) {
+	    ;
+#endif
         } else {
         reg_error:
-            error("unknown register");
+            tcc_error("unknown register %%%s", get_tok_str(tok, &tokc));
         }
         next();
     no_skip: ;
@@ -289,8 +413,7 @@ static void parse_operand(TCCState *s1, Operand *op)
         next();
         asm_expr(s1, &e);
         op->type = OP_IM32;
-        op->e.v = e.v;
-        op->e.sym = e.sym;
+        op->e = e;
         if (!op->e.sym) {
             if (op->e.v == (uint8_t)op->e.v)
                 op->type |= OP_IM8;
@@ -298,6 +421,10 @@ static void parse_operand(TCCState *s1, Operand *op)
                 op->type |= OP_IM8S;
             if (op->e.v == (uint16_t)op->e.v)
                 op->type |= OP_IM16;
+#ifdef TCC_TARGET_X86_64
+            if (op->e.v != (int32_t)op->e.v && op->e.v != (uint32_t)op->e.v)
+                op->type = OP_IM64;
+#endif
         }
     } else {
         /* address(reg,reg2,shift) with all variants */
@@ -307,27 +434,42 @@ static void parse_operand(TCCState *s1, Operand *op)
         op->shift = 0;
         if (tok != '(') {
             asm_expr(s1, &e);
-            op->e.v = e.v;
-            op->e.sym = e.sym;
+            op->e = e;
         } else {
-            op->e.v = 0;
-            op->e.sym = NULL;
+            next();
+            if (tok == '%') {
+                unget_tok('(');
+                op->e.v = 0;
+                op->e.sym = NULL;
+            } else {
+                /* bracketed offset expression */
+                asm_expr(s1, &e);
+                if (tok != ')')
+                    expect(")");
+                next();
+                op->e.v = e.v;
+                op->e.sym = e.sym;
+            }
+	    op->e.pcrel = 0;
         }
         if (tok == '(') {
+	    unsigned int type = 0;
             next();
             if (tok != ',') {
-                op->reg = asm_parse_reg();
+                op->reg = asm_parse_reg(&type);
             }
             if (tok == ',') {
                 next();
                 if (tok != ',') {
-                    op->reg2 = asm_parse_reg();
-                } 
+                    op->reg2 = asm_parse_reg(&type);
+                }
                 if (tok == ',') {
                     next();
                     op->shift = get_reg_shift(s1);
                 }
             }
+	    if (type & OP_REG32)
+	        op->type |= OP_EA32;
             skip(')');
         }
         if (op->reg == -1 && op->reg2 == -1)
@@ -337,46 +479,44 @@ static void parse_operand(TCCState *s1, Operand *op)
 }
 
 /* XXX: unify with C code output ? */
-static void gen_expr32(ExprValue *pe)
+ST_FUNC void gen_expr32(ExprValue *pe)
+{
+    if (pe->pcrel)
+        /* If PC-relative, always set VT_SYM, even without symbol,
+	   so as to force a relocation to be emitted.  */
+	gen_addrpc32(VT_SYM, pe->sym, pe->v);
+    else
+	gen_addr32(pe->sym ? VT_SYM : 0, pe->sym, pe->v);
+}
+
+#ifdef TCC_TARGET_X86_64
+ST_FUNC void gen_expr64(ExprValue *pe)
 {
-    if (pe->sym)
-        greloc(cur_text_section, pe->sym, ind, R_386_32);
-    gen_le32(pe->v);
+    gen_addr64(pe->sym ? VT_SYM : 0, pe->sym, pe->v);
 }
+#endif
 
 /* XXX: unify with C code output ? */
 static void gen_disp32(ExprValue *pe)
 {
-    Sym *sym;
-    sym = pe->sym;
-    if (sym) {
-        if (sym->r == cur_text_section->sh_num) {
-            /* same section: we can output an absolute value. Note
-               that the TCC compiler behaves differently here because
-               it always outputs a relocation to ease (future) code
-               elimination in the linker */
-            gen_le32(pe->v + (long)sym->next - ind - 4);
-        } else {
-            greloc(cur_text_section, sym, ind, R_386_PC32);
-            gen_le32(pe->v - 4);
-        }
+    Sym *sym = pe->sym;
+    if (sym && sym->r == cur_text_section->sh_num) {
+        /* same section: we can output an absolute value. Note
+           that the TCC compiler behaves differently here because
+           it always outputs a relocation to ease (future) code
+           elimination in the linker */
+        gen_le32(pe->v + sym->jnext - ind - 4);
     } else {
-        /* put an empty PC32 relocation */
-        put_elf_reloc(symtab_section, cur_text_section, 
-                      ind, R_386_PC32, 0);
-        gen_le32(pe->v - 4);
+        if (sym && sym->type.t == VT_VOID) {
+            sym->type.t = VT_FUNC;
+            sym->type.ref = NULL;
+        }
+        gen_addrpc32(VT_SYM, sym, pe->v);
     }
 }
 
-
-static void gen_le16(int v)
-{
-    g(v);
-    g(v >> 8);
-}
-
 /* generate the modrm operand */
-static inline void asm_modrm(int reg, Operand *op)
+static inline int asm_modrm(int reg, Operand *op)
 {
     int mod, reg1, reg2, sib_reg1;
 
@@ -384,8 +524,20 @@ static inline void asm_modrm(int reg, Operand *op)
         g(0xc0 + (reg << 3) + op->reg);
     } else if (op->reg == -1 && op->reg2 == -1) {
         /* displacement only */
+#ifdef TCC_TARGET_X86_64
+	g(0x04 + (reg << 3));
+	g(0x25);
+#else
+	g(0x05 + (reg << 3));
+#endif
+	gen_expr32(&op->e);
+#ifdef TCC_TARGET_X86_64
+    } else if (op->reg == -2) {
+        ExprValue *pe = &op->e;
         g(0x05 + (reg << 3));
-        gen_expr32(&op->e);
+        gen_addrpc32(pe->sym ? VT_SYM : 0, pe->sym, pe->v);
+        return ind;
+#endif
     } else {
         sib_reg1 = op->reg;
         /* fist compute displacement encoding */
@@ -411,44 +563,158 @@ static inline void asm_modrm(int reg, Operand *op)
                 reg2 = 4; /* indicate no index */
             g((op->shift << 6) + (reg2 << 3) + sib_reg1);
         }
-
         /* add offset */
         if (mod == 0x40) {
             g(op->e.v);
         } else if (mod == 0x80 || op->reg == -1) {
-            gen_expr32(&op->e);
+	    gen_expr32(&op->e);
+        }
+    }
+    return 0;
+}
+
+#ifdef TCC_TARGET_X86_64
+#define REX_W 0x48
+#define REX_R 0x44
+#define REX_X 0x42
+#define REX_B 0x41
+
+static void asm_rex(int width64, Operand *ops, int nb_ops, int *op_type,
+		    int regi, int rmi)
+{
+  unsigned char rex = width64 ? 0x48 : 0;
+  int saw_high_8bit = 0;
+  int i;
+  if (rmi == -1) {
+      /* No mod/rm byte, but we might have a register op nevertheless
+         (we will add it to the opcode later).  */
+      for(i = 0; i < nb_ops; i++) {
+	  if (op_type[i] & (OP_REG | OP_ST)) {
+	      if (ops[i].reg >= 8) {
+		  rex |= REX_B;
+		  ops[i].reg -= 8;
+	      } else if (ops[i].type & OP_REG8_LOW)
+		  rex |= 0x40;
+	      else if (ops[i].type & OP_REG8 && ops[i].reg >= 4)
+		  /* An 8 bit reg >= 4 without REG8 is ah/ch/dh/bh */
+		  saw_high_8bit = ops[i].reg;
+	      break;
+	  }
+      }
+  } else {
+      if (regi != -1) {
+	  if (ops[regi].reg >= 8) {
+	      rex |= REX_R;
+	      ops[regi].reg -= 8;
+	  } else if (ops[regi].type & OP_REG8_LOW)
+	      rex |= 0x40;
+	  else if (ops[regi].type & OP_REG8 && ops[regi].reg >= 4)
+	      /* An 8 bit reg >= 4 without REG8 is ah/ch/dh/bh */
+	      saw_high_8bit = ops[regi].reg;
+      }
+      if (ops[rmi].type & (OP_REG | OP_MMX | OP_SSE | OP_CR | OP_EA)) {
+	  if (ops[rmi].reg >= 8) {
+	      rex |= REX_B;
+	      ops[rmi].reg -= 8;
+	  } else if (ops[rmi].type & OP_REG8_LOW)
+	      rex |= 0x40;
+	  else if (ops[rmi].type & OP_REG8 && ops[rmi].reg >= 4)
+	      /* An 8 bit reg >= 4 without REG8 is ah/ch/dh/bh */
+	      saw_high_8bit = ops[rmi].reg;
+      }
+      if (ops[rmi].type & OP_EA && ops[rmi].reg2 >= 8) {
+	  rex |= REX_X;
+	  ops[rmi].reg2 -= 8;
+      }
+  }
+  if (rex) {
+      if (saw_high_8bit)
+	  tcc_error("can't encode register %%%ch when REX prefix is required",
+		    "acdb"[saw_high_8bit-4]);
+      g(rex);
+  }
+}
+#endif
+
+static void maybe_print_stats (void)
+{
+  static int already = 1;
+  if (!already)
+    /* print stats about opcodes */
+    {
+        const struct ASMInstr *pa;
+        int freq[4];
+        int op_vals[500];
+        int nb_op_vals, i, j;
+
+	already = 1;
+        nb_op_vals = 0;
+        memset(freq, 0, sizeof(freq));
+        for(pa = asm_instrs; pa->sym != 0; pa++) {
+            freq[pa->nb_ops]++;
+            //for(i=0;i<pa->nb_ops;i++) {
+                for(j=0;j<nb_op_vals;j++) {
+                    //if (pa->op_type[i] == op_vals[j])
+                    if (pa->instr_type == op_vals[j])
+                        goto found;
+                }
+                //op_vals[nb_op_vals++] = pa->op_type[i];
+                op_vals[nb_op_vals++] = pa->instr_type;
+            found: ;
+            //}
+        }
+        for(i=0;i<nb_op_vals;i++) {
+            int v = op_vals[i];
+            //if ((v & (v - 1)) != 0)
+                printf("%3d: %08x\n", i, v);
         }
+        printf("size=%d nb=%d f0=%d f1=%d f2=%d f3=%d\n",
+               (int)sizeof(asm_instrs),
+	       (int)sizeof(asm_instrs) / (int)sizeof(ASMInstr),
+               freq[0], freq[1], freq[2], freq[3]);
     }
 }
 
-static void asm_opcode(TCCState *s1, int opcode)
+ST_FUNC void asm_opcode(TCCState *s1, int opcode)
 {
     const ASMInstr *pa;
-    int i, modrm_index, reg, v, op1, is_short_jmp, seg_prefix;
-    int nb_ops, s, ss;
+    int i, modrm_index, modreg_index, reg, v, op1, seg_prefix, pc;
+    int nb_ops, s;
     Operand ops[MAX_OPERANDS], *pop;
     int op_type[3]; /* decoded op type */
+    int alltypes;   /* OR of all operand types */
+    int autosize;
+    int p66;
+#ifdef TCC_TARGET_X86_64
+    int rex64;
+#endif
+
+    maybe_print_stats();
+    /* force synthetic ';' after prefix instruction, so we can handle */
+    /* one-line things like "rep stosb" instead of only "rep\nstosb" */
+    if (opcode >= TOK_ASM_wait && opcode <= TOK_ASM_repnz)
+        unget_tok(';');
 
     /* get operands */
     pop = ops;
     nb_ops = 0;
     seg_prefix = 0;
+    alltypes = 0;
     for(;;) {
         if (tok == ';' || tok == TOK_LINEFEED)
             break;
         if (nb_ops >= MAX_OPERANDS) {
-            error("incorrect number of operands");
+            tcc_error("incorrect number of operands");
         }
         parse_operand(s1, pop);
         if (tok == ':') {
-           if (pop->type != OP_SEG || seg_prefix) {
-               error("incorrect prefix");
-           }
+           if (pop->type != OP_SEG || seg_prefix)
+               tcc_error("incorrect prefix");
            seg_prefix = segment_prefixes[pop->reg];
            next();
            parse_operand(s1, pop);
            if (!(pop->type & OP_EA)) {
-               error("segment prefix must be followed by memory reference");
+               tcc_error("segment prefix must be followed by memory reference");
            }
         }
         pop++;
@@ -458,35 +724,55 @@ static void asm_opcode(TCCState *s1, int opcode)
         next();
     }
 
-    is_short_jmp = 0;
     s = 0; /* avoid warning */
-    
+
     /* optimize matching by using a lookup table (no hashing is needed
        !) */
     for(pa = asm_instrs; pa->sym != 0; pa++) {
+	int it = pa->instr_type & OPCT_MASK;
         s = 0;
-        if (pa->instr_type & OPC_FARITH) {
+        if (it == OPC_FARITH) {
             v = opcode - pa->sym;
             if (!((unsigned)v < 8 * 6 && (v % 6) == 0))
                 continue;
-        } else if (pa->instr_type & OPC_ARITH) {
-            if (!(opcode >= pa->sym && opcode < pa->sym + 8 * 4))
+        } else if (it == OPC_ARITH) {
+            if (!(opcode >= pa->sym && opcode < pa->sym + 8*NBWLX))
                 continue;
-            goto compute_size;
-        } else if (pa->instr_type & OPC_SHIFT) {
-            if (!(opcode >= pa->sym && opcode < pa->sym + 7 * 4))
+            s = (opcode - pa->sym) % NBWLX;
+	    if ((pa->instr_type & OPC_BWLX) == OPC_WLX)
+	      {
+		/* We need to reject the xxxb opcodes that we accepted above.
+		   Note that pa->sym for WLX opcodes is the 'w' token,
+		   to get the 'b' token subtract one.  */
+		if (((opcode - pa->sym + 1) % NBWLX) == 0)
+		    continue;
+	        s++;
+	      }
+        } else if (it == OPC_SHIFT) {
+            if (!(opcode >= pa->sym && opcode < pa->sym + 7*NBWLX))
                 continue;
-            goto compute_size;
-        } else if (pa->instr_type & OPC_TEST) {
+            s = (opcode - pa->sym) % NBWLX;
+        } else if (it == OPC_TEST) {
             if (!(opcode >= pa->sym && opcode < pa->sym + NB_TEST_OPCODES))
                 continue;
+	    /* cmovxx is a test opcode but accepts multiple sizes.
+	       TCC doesn't accept the suffixed mnemonic, instead we 
+	       simply force size autodetection always.  */
+	    if (pa->instr_type & OPC_WLX)
+	        s = NBWLX - 1;
         } else if (pa->instr_type & OPC_B) {
-            if (!(opcode >= pa->sym && opcode <= pa->sym + 3))
+#ifdef TCC_TARGET_X86_64
+	    /* Some instructions don't have the full size but only
+	       bwl form.  insb e.g. */
+	    if ((pa->instr_type & OPC_WLQ) != OPC_WLQ
+		&& !(opcode >= pa->sym && opcode < pa->sym + NBWLX-1))
+	        continue;
+#endif
+            if (!(opcode >= pa->sym && opcode < pa->sym + NBWLX))
                 continue;
-        compute_size:
-            s = (opcode - pa->sym) & 3;
-        } else if (pa->instr_type & OPC_WL) {
-            if (!(opcode >= pa->sym && opcode <= pa->sym + 2))
+            s = opcode - pa->sym;
+        } else if (pa->instr_type & OPC_WLX) {
+            if (!(opcode >= pa->sym && opcode < pa->sym + NBWLX-1))
                 continue;
             s = opcode - pa->sym + 1;
         } else {
@@ -495,7 +781,17 @@ static void asm_opcode(TCCState *s1, int opcode)
         }
         if (pa->nb_ops != nb_ops)
             continue;
+#ifdef TCC_TARGET_X86_64
+	/* Special case for moves.  Selecting the IM64->REG64 form
+	   should only be done if we really have an >32bit imm64, and that
+	   is hardcoded.  Ignore it here.  */
+	if (pa->opcode == 0xb0 && ops[0].type != OP_IM64
+	    && (ops[1].type & OP_REG) == OP_REG64
+	    && !(pa->instr_type & OPC_0F))
+	    continue;
+#endif
         /* now decode and check each operand */
+	alltypes = 0;
         for(i = 0; i < nb_ops; i++) {
             int op1, op2;
             op1 = pa->op_type[i];
@@ -505,73 +801,146 @@ static void asm_opcode(TCCState *s1, int opcode)
                 v = OP_IM8 | OP_IM16 | OP_IM32;
                 break;
             case OPT_REG:
-                v = OP_REG8 | OP_REG16 | OP_REG32;
+                v = OP_REG8 | OP_REG16 | OP_REG32 | OP_REG64;
                 break;
             case OPT_REGW:
-                v = OP_REG16 | OP_REG32;
+                v = OP_REG16 | OP_REG32 | OP_REG64;
                 break;
             case OPT_IMW:
                 v = OP_IM16 | OP_IM32;
                 break;
+	    case OPT_MMXSSE:
+		v = OP_MMX | OP_SSE;
+		break;
+	    case OPT_DISP:
+	    case OPT_DISP8:
+		v = OP_ADDR;
+		break;
             default:
                 v = 1 << op2;
                 break;
             }
             if (op1 & OPT_EA)
                 v |= OP_EA;
-            op_type[i] = v;
+	    op_type[i] = v;
             if ((ops[i].type & v) == 0)
                 goto next;
+	    alltypes |= ops[i].type;
         }
         /* all is matching ! */
         break;
     next: ;
     }
     if (pa->sym == 0) {
-        if (opcode >= TOK_ASM_pusha && opcode <= TOK_ASM_emms) {
+        if (opcode >= TOK_ASM_first && opcode <= TOK_ASM_last) {
             int b;
-            b = op0_codes[opcode - TOK_ASM_pusha];
+            b = op0_codes[opcode - TOK_ASM_first];
             if (b & 0xff00) 
                 g(b >> 8);
             g(b);
             return;
+        } else if (opcode <= TOK_ASM_alllast) {
+            tcc_error("bad operand with opcode '%s'",
+                  get_tok_str(opcode, NULL));
         } else {
-            error("unknown opcode '%s'", 
+            tcc_error("unknown opcode '%s'",
                   get_tok_str(opcode, NULL));
         }
     }
     /* if the size is unknown, then evaluate it (OPC_B or OPC_WL case) */
-    if (s == 3) {
-        for(i = 0; s == 3 && i < nb_ops; i++) {
+    autosize = NBWLX-1;
+#ifdef TCC_TARGET_X86_64
+    /* XXX the autosize should rather be zero, to not have to adjust this
+       all the time.  */
+    if ((pa->instr_type & OPC_BWLQ) == OPC_B)
+        autosize = NBWLX-2;
+#endif
+    if (s == autosize) {
+	/* Check for register operands providing hints about the size.
+	   Start from the end, i.e. destination operands.  This matters
+	   only for opcodes accepting different sized registers, lar and lsl
+	   are such opcodes.  */
+        for(i = nb_ops - 1; s == autosize && i >= 0; i--) {
             if ((ops[i].type & OP_REG) && !(op_type[i] & (OP_CL | OP_DX)))
                 s = reg_to_size[ops[i].type & OP_REG];
         }
-        if (s == 3) {
-            if ((opcode == TOK_ASM_push || opcode == TOK_ASM_pop) && 
+        if (s == autosize) {
+            if ((opcode == TOK_ASM_push || opcode == TOK_ASM_pop) &&
                 (ops[0].type & (OP_SEG | OP_IM8S | OP_IM32)))
                 s = 2;
+	    else if ((opcode == TOK_ASM_push || opcode == TOK_ASM_pop) &&
+		     (ops[0].type & OP_EA))
+	        s = NBWLX - 2;
             else
-                error("cannot infer opcode suffix");
+                tcc_error("cannot infer opcode suffix");
         }
     }
 
+#ifdef TCC_TARGET_X86_64
+    /* Generate addr32 prefix if needed */
+    for(i = 0; i < nb_ops; i++) {
+        if (ops[i].type & OP_EA32) {
+	    g(0x67);
+	    break;
+        }
+    }
+#endif
     /* generate data16 prefix if needed */
-    ss = s;
-    if (s == 1 || (pa->instr_type & OPC_D16))
-        g(WORD_PREFIX_OPCODE);
-    else if (s == 2)
-        s = 1;
+    p66 = 0;
+    if (s == 1)
+        p66 = 1;
+    else {
+	/* accepting mmx+sse in all operands --> needs 0x66 to
+	   switch to sse mode.  Accepting only sse in an operand --> is
+	   already SSE insn and needs 0x66/f2/f3 handling.  */
+        for (i = 0; i < nb_ops; i++)
+            if ((op_type[i] & (OP_MMX | OP_SSE)) == (OP_MMX | OP_SSE)
+	        && ops[i].type & OP_SSE)
+	        p66 = 1;
+    }
+    if (p66)
+        g(0x66);
+#ifdef TCC_TARGET_X86_64
+    rex64 = 0;
+    if (pa->instr_type & OPC_48)
+        rex64 = 1;
+    else if (s == 3 || (alltypes & OP_REG64)) {
+        /* generate REX prefix */
+	int default64 = 0;
+	for(i = 0; i < nb_ops; i++) {
+	    if (op_type[i] == OP_REG64 && pa->opcode != 0xb8) {
+		/* If only 64bit regs are accepted in one operand
+		   this is a default64 instruction without need for
+		   REX prefixes, except for movabs(0xb8).  */
+		default64 = 1;
+		break;
+	    }
+	}
+	/* XXX find better encoding for the default64 instructions.  */
+        if (((opcode != TOK_ASM_push && opcode != TOK_ASM_pop
+	      && opcode != TOK_ASM_pushw && opcode != TOK_ASM_pushl
+	      && opcode != TOK_ASM_pushq && opcode != TOK_ASM_popw
+	      && opcode != TOK_ASM_popl && opcode != TOK_ASM_popq
+	      && opcode != TOK_ASM_call && opcode != TOK_ASM_jmp))
+	    && !default64)
+            rex64 = 1;
+    }
+#endif
+
     /* now generates the operation */
-    if (pa->instr_type & OPC_FWAIT)
+    if (OPCT_IS(pa->instr_type, OPC_FWAIT))
         g(0x9b);
     if (seg_prefix)
         g(seg_prefix);
 
     v = pa->opcode;
-    if (v == 0x69 || v == 0x69) {
+    if (pa->instr_type & OPC_0F)
+        v = ((v & ~0xff) << 8) | 0x0f00 | (v & 0xff);
+    if ((v == 0x69 || v == 0x6b) && nb_ops == 2) {
         /* kludge for imul $im, %reg */
         nb_ops = 3;
         ops[2] = ops[1];
+        op_type[2] = op_type[1];
     } else if (v == 0xcd && ops[0].e.v == 3 && !ops[0].e.sym) {
         v--; /* int $3 case */
         nb_ops = 0;
@@ -585,27 +954,69 @@ static void asm_opcode(TCCState *s1, int opcode)
         nb_ops = 0;
     } else if (v <= 0x05) {
         /* arith case */
-        v += ((opcode - TOK_ASM_addb) >> 2) << 3;
-    } else if ((pa->instr_type & (OPC_FARITH | OPC_MODRM)) == OPC_FARITH) {
+        v += ((opcode - TOK_ASM_addb) / NBWLX) << 3;
+    } else if ((pa->instr_type & (OPCT_MASK | OPC_MODRM)) == OPC_FARITH) {
         /* fpu arith case */
         v += ((opcode - pa->sym) / 6) << 3;
     }
+
+    /* search which operand will be used for modrm */
+    modrm_index = -1;
+    modreg_index = -1;
+    if (pa->instr_type & OPC_MODRM) {
+	if (!nb_ops) {
+	    /* A modrm opcode without operands is a special case (e.g. mfence).
+	       It has a group and acts as if there's an register operand 0
+	       (ax).  */
+	    i = 0;
+	    ops[i].type = OP_REG;
+	    ops[i].reg = 0;
+	    goto modrm_found;
+	}
+        /* first look for an ea operand */
+        for(i = 0;i < nb_ops; i++) {
+            if (op_type[i] & OP_EA)
+                goto modrm_found;
+        }
+        /* then if not found, a register or indirection (shift instructions) */
+        for(i = 0;i < nb_ops; i++) {
+            if (op_type[i] & (OP_REG | OP_MMX | OP_SSE | OP_INDIR))
+                goto modrm_found;
+        }
+#ifdef ASM_DEBUG
+        tcc_error("bad op table");
+#endif
+    modrm_found:
+        modrm_index = i;
+        /* if a register is used in another operand then it is
+           used instead of group */
+        for(i = 0;i < nb_ops; i++) {
+            int t = op_type[i];
+            if (i != modrm_index &&
+                (t & (OP_REG | OP_MMX | OP_SSE | OP_CR | OP_TR | OP_DB | OP_SEG))) {
+                modreg_index = i;
+                break;
+            }
+        }
+    }
+#ifdef TCC_TARGET_X86_64
+    asm_rex (rex64, ops, nb_ops, op_type, modreg_index, modrm_index);
+#endif
+
     if (pa->instr_type & OPC_REG) {
+        /* mov $im, %reg case */
+        if (v == 0xb0 && s >= 1)
+            v += 7;
         for(i = 0; i < nb_ops; i++) {
             if (op_type[i] & (OP_REG | OP_ST)) {
                 v += ops[i].reg;
                 break;
             }
         }
-        /* mov $im, %reg case */
-        if (pa->opcode == 0xb0 && s >= 1)
-            v += 7;
     }
     if (pa->instr_type & OPC_B)
-        v += s;
-    if (pa->instr_type & OPC_TEST)
-        v += test_bits[opcode - pa->sym]; 
-    if (pa->instr_type & OPC_SHORTJMP) {
+        v += s >= 1;
+    if (nb_ops == 1 && pa->op_type[0] == OPT_DISP8) {
         Sym *sym;
         int jmp_disp;
 
@@ -615,122 +1026,106 @@ static void asm_opcode(TCCState *s1, int opcode)
             goto no_short_jump;
         if (sym->r != cur_text_section->sh_num)
             goto no_short_jump;
-        jmp_disp = ops[0].e.v + (long)sym->next - ind - 2;
+        jmp_disp = ops[0].e.v + sym->jnext - ind - 2 - (v >= 0xff);
         if (jmp_disp == (int8_t)jmp_disp) {
             /* OK to generate jump */
-            is_short_jmp = 1;
+	    ops[0].e.sym = 0;
             ops[0].e.v = jmp_disp;
+	    op_type[0] = OP_IM8S;
         } else {
         no_short_jump:
-            if (pa->instr_type & OPC_JMP) {
-                /* long jump will be allowed. need to modify the
-                   opcode slightly */
-                if (v == 0xeb)
-                    v = 0xe9;
-                else 
-                    v += 0x0f10;
-            } else {
-                error("invalid displacement");
-            }
+	    /* long jump will be allowed. need to modify the
+	       opcode slightly */
+	    if (v == 0xeb) /* jmp */
+	        v = 0xe9;
+	    else if (v == 0x70) /* jcc */
+	        v += 0x0f10;
+	    else
+	        tcc_error("invalid displacement");
         }
     }
-    op1 = v >> 8;
+    if (OPCT_IS(pa->instr_type, OPC_TEST))
+        v += test_bits[opcode - pa->sym];
+    op1 = v >> 16;
+    if (op1)
+        g(op1);
+    op1 = (v >> 8) & 0xff;
     if (op1)
         g(op1);
     g(v);
-        
-    /* search which operand will used for modrm */
-    modrm_index = 0;
-    if (pa->instr_type & OPC_SHIFT) {
-        reg = (opcode - pa->sym) >> 2; 
+
+    if (OPCT_IS(pa->instr_type, OPC_SHIFT)) {
+        reg = (opcode - pa->sym) / NBWLX;
         if (reg == 6)
             reg = 7;
-    } else if (pa->instr_type & OPC_ARITH) {
-        reg = (opcode - pa->sym) >> 2;
-    } else if (pa->instr_type & OPC_FARITH) {
+    } else if (OPCT_IS(pa->instr_type, OPC_ARITH)) {
+        reg = (opcode - pa->sym) / NBWLX;
+    } else if (OPCT_IS(pa->instr_type, OPC_FARITH)) {
         reg = (opcode - pa->sym) / 6;
     } else {
         reg = (pa->instr_type >> OPC_GROUP_SHIFT) & 7;
     }
+
+    pc = 0;
     if (pa->instr_type & OPC_MODRM) {
-        /* first look for an ea operand */
-        for(i = 0;i < nb_ops; i++) {
-            if (op_type[i] & OP_EA)
-                goto modrm_found;
-        }
-        /* then if not found, a register or indirection (shift instructions) */
-        for(i = 0;i < nb_ops; i++) {
-            if (op_type[i] & (OP_REG | OP_MMX | OP_SSE | OP_INDIR))
-                goto modrm_found;
-        }
-#ifdef ASM_DEBUG
-        error("bad op table");
-#endif      
-    modrm_found:
-        modrm_index = i;
         /* if a register is used in another operand then it is
            used instead of group */
-        for(i = 0;i < nb_ops; i++) {
-            v = op_type[i];
-            if (i != modrm_index && 
-                (v & (OP_REG | OP_MMX | OP_SSE | OP_CR | OP_TR | OP_DB | OP_SEG))) {
-                reg = ops[i].reg;
-                break;
-            }
-        }
-
-        asm_modrm(reg, &ops[modrm_index]);
+	if (modreg_index >= 0)
+	    reg = ops[modreg_index].reg;
+        pc = asm_modrm(reg, &ops[modrm_index]);
     }
 
     /* emit constants */
-    if (pa->opcode == 0x9a || pa->opcode == 0xea) {
+#ifndef TCC_TARGET_X86_64
+    if (!(pa->instr_type & OPC_0F)
+	&& (pa->opcode == 0x9a || pa->opcode == 0xea)) {
         /* ljmp or lcall kludge */
-        gen_expr32(&ops[1].e);
+	gen_expr32(&ops[1].e);
         if (ops[0].e.sym)
-            error("cannot relocate");
+            tcc_error("cannot relocate");
         gen_le16(ops[0].e.v);
-    } else {
-        for(i = 0;i < nb_ops; i++) {
-            v = op_type[i];
-            if (v & (OP_IM8 | OP_IM16 | OP_IM32 | OP_IM8S | OP_ADDR)) {
-                /* if multiple sizes are given it means we must look
-                   at the op size */
-                if (v == (OP_IM8 | OP_IM16 | OP_IM32) ||
-                    v == (OP_IM16 | OP_IM32)) {
-                    if (ss == 0)
-                        v = OP_IM8;
-                    else if (ss == 1)
-                        v = OP_IM16;
-                    else
-                        v = OP_IM32;
-                }
-                if (v & (OP_IM8 | OP_IM8S)) {
-                    if (ops[i].e.sym)
-                        goto error_relocate;
-                    g(ops[i].e.v);
-                } else if (v & OP_IM16) {
-                    if (ops[i].e.sym) {
-                    error_relocate:
-                        error("cannot relocate");
-                    }
-                    gen_le16(ops[i].e.v);
-                } else {
-                    if (pa->instr_type & (OPC_JMP | OPC_SHORTJMP)) {
-                        if (is_short_jmp)
-                            g(ops[i].e.v);
-                        else
-                            gen_disp32(&ops[i].e);
-                    } else {
-                        gen_expr32(&ops[i].e);
-                    }
-                }
+        return;
+    }
+#endif
+    for(i = 0;i < nb_ops; i++) {
+        v = op_type[i];
+        if (v & (OP_IM8 | OP_IM16 | OP_IM32 | OP_IM64 | OP_IM8S | OP_ADDR)) {
+            /* if multiple sizes are given it means we must look
+               at the op size */
+            if ((v | OP_IM8 | OP_IM64) == (OP_IM8 | OP_IM16 | OP_IM32 | OP_IM64)) {
+                if (s == 0)
+                    v = OP_IM8;
+                else if (s == 1)
+                    v = OP_IM16;
+                else if (s == 2 || (v & OP_IM64) == 0)
+                    v = OP_IM32;
+                else
+                    v = OP_IM64;
+            }
+
+            if ((v & (OP_IM8 | OP_IM8S | OP_IM16)) && ops[i].e.sym)
+                tcc_error("cannot relocate");
+
+            if (v & (OP_IM8 | OP_IM8S)) {
+                g(ops[i].e.v);
+            } else if (v & OP_IM16) {
+                gen_le16(ops[i].e.v);
+#ifdef TCC_TARGET_X86_64
+            } else if (v & OP_IM64) {
+                gen_expr64(&ops[i].e);
+#endif
+	    } else if (pa->op_type[i] == OPT_DISP || pa->op_type[i] == OPT_DISP8) {
+                gen_disp32(&ops[i].e);
+            } else {
+                gen_expr32(&ops[i].e);
             }
         }
     }
-}
 
-#define NB_SAVED_REGS 3
-#define NB_ASM_REGS 8
+    /* after immediate operands, adjust pc-relative address */
+    if (pc)
+        add32le(cur_text_section->data + pc - 4, pc - ind);
+}
 
 /* return the constraint priority (we allocate first the lowest
    numbered constraints) */
@@ -761,18 +1156,21 @@ static inline int constraint_priority(const char *str)
             pr = 2;
             break;
         case 'r':
+	case 'R':
+	case 'p':
             pr = 3;
             break;
         case 'N':
         case 'M':
         case 'I':
+	case 'e':
         case 'i':
         case 'm':
         case 'g':
             pr = 4;
             break;
         default:
-            error("unknown constraint '%c'", c);
+            tcc_error("unknown constraint '%c'", c);
             pr = 0;
         }
         if (pr > priority)
@@ -788,13 +1186,35 @@ static const char *skip_constraint_modifiers(const char *p)
     return p;
 }
 
+/* If T (a token) is of the form "%reg" returns the register
+   number and type, otherwise return -1.  */
+ST_FUNC int asm_parse_regvar (int t)
+{
+    const char *s;
+    Operand op;
+    if (t < TOK_IDENT)
+        return -1;
+    s = table_ident[t - TOK_IDENT]->str;
+    if (s[0] != '%')
+        return -1;
+    t = tok_alloc(s+1, strlen(s)-1)->tok;
+    unget_tok(t);
+    unget_tok('%');
+    parse_operand(tcc_state, &op);
+    /* Accept only integer regs for now.  */
+    if (op.type & OP_REG)
+        return op.reg;
+    else
+        return -1;
+}
+
 #define REG_OUT_MASK 0x01
 #define REG_IN_MASK  0x02
 
 #define is_reg_allocated(reg) (regs_allocated[reg] & reg_mask)
 
-static void asm_compute_constraints(ASMOperand *operands, 
-                                    int nb_operands, int nb_outputs, 
+ST_FUNC void asm_compute_constraints(ASMOperand *operands,
+                                    int nb_operands, int nb_outputs,
                                     const uint8_t *clobber_regs,
                                     int *pout_reg)
 {
@@ -803,7 +1223,7 @@ static void asm_compute_constraints(ASMOperand *operands,
     int i, j, k, p1, p2, tmp, reg, c, reg_mask;
     const char *str;
     uint8_t regs_allocated[NB_ASM_REGS];
-    
+
     /* init fields */
     for(i=0;i<nb_operands;i++) {
         op = &operands[i];
@@ -823,24 +1243,29 @@ static void asm_compute_constraints(ASMOperand *operands,
             /* this is a reference to another constraint */
             k = find_constraint(operands, nb_operands, str, NULL);
             if ((unsigned)k >= i || i < nb_outputs)
-                error("invalid reference in constraint %d ('%s')",
+                tcc_error("invalid reference in constraint %d ('%s')",
                       i, str);
             op->ref_index = k;
             if (operands[k].input_index >= 0)
-                error("cannot reference twice the same operand");
+                tcc_error("cannot reference twice the same operand");
             operands[k].input_index = i;
             op->priority = 5;
+	} else if ((op->vt->r & VT_VALMASK) == VT_LOCAL
+		   && op->vt->sym
+		   && (reg = op->vt->sym->r & VT_VALMASK) < VT_CONST) {
+	    op->priority = 1;
+	    op->reg = reg;
         } else {
             op->priority = constraint_priority(str);
         }
     }
-    
+
     /* sort operands according to their priority */
     for(i=0;i<nb_operands;i++)
         sorted_op[i] = i;
     for(i=0;i<nb_operands - 1;i++) {
         for(j=i+1;j<nb_operands;j++) {
-            p1 = operands[sorted_op[i]].priority; 
+            p1 = operands[sorted_op[i]].priority;
             p2 = operands[sorted_op[j]].priority;
             if (p2 < p1) {
                 tmp = sorted_op[i];
@@ -857,9 +1282,9 @@ static void asm_compute_constraints(ASMOperand *operands,
             regs_allocated[i] = 0;
     }
     /* esp cannot be used */
-    regs_allocated[4] = REG_IN_MASK | REG_OUT_MASK; 
+    regs_allocated[4] = REG_IN_MASK | REG_OUT_MASK;
     /* ebp cannot be used yet */
-    regs_allocated[5] = REG_IN_MASK | REG_OUT_MASK; 
+    regs_allocated[5] = REG_IN_MASK | REG_OUT_MASK;
 
     /* allocate registers and generate corresponding asm moves */
     for(i=0;i<nb_operands;i++) {
@@ -877,6 +1302,12 @@ static void asm_compute_constraints(ASMOperand *operands,
         } else {
             reg_mask = REG_IN_MASK;
         }
+	if (op->reg >= 0) {
+	    if (is_reg_allocated(op->reg))
+	        tcc_error("asm regvar requests register that's taken already");
+	    reg = op->reg;
+	    goto reg_found;
+	}
     try_next:
         c = *str++;
         switch(c) {
@@ -887,30 +1318,30 @@ static void asm_compute_constraints(ASMOperand *operands,
             /* FALL THRU */
         case '&':
             if (j >= nb_outputs)
-                error("'%c' modifier can only be applied to outputs", c);
+                tcc_error("'%c' modifier can only be applied to outputs", c);
             reg_mask = REG_IN_MASK | REG_OUT_MASK;
             goto try_next;
         case 'A':
             /* allocate both eax and edx */
-            if (is_reg_allocated(TREG_EAX) || 
-                is_reg_allocated(TREG_EDX))
+            if (is_reg_allocated(TREG_XAX) ||
+                is_reg_allocated(TREG_XDX))
                 goto try_next;
             op->is_llong = 1;
-            op->reg = TREG_EAX;
-            regs_allocated[TREG_EAX] |= reg_mask;
-            regs_allocated[TREG_EDX] |= reg_mask;
+            op->reg = TREG_XAX;
+            regs_allocated[TREG_XAX] |= reg_mask;
+            regs_allocated[TREG_XDX] |= reg_mask;
             break;
         case 'a':
-            reg = TREG_EAX;
+            reg = TREG_XAX;
             goto alloc_reg;
         case 'b':
             reg = 3;
             goto alloc_reg;
         case 'c':
-            reg = TREG_ECX;
+            reg = TREG_XCX;
             goto alloc_reg;
         case 'd':
-            reg = TREG_EDX;
+            reg = TREG_XDX;
             goto alloc_reg;
         case 'S':
             reg = 6;
@@ -929,6 +1360,8 @@ static void asm_compute_constraints(ASMOperand *operands,
             }
             goto try_next;
         case 'r':
+	case 'R':
+	case 'p': /* A general address, for x86(64) any register is acceptable*/
             /* any general register */
             for(reg = 0; reg < 8; reg++) {
                 if (!is_reg_allocated(reg))
@@ -941,6 +1374,7 @@ static void asm_compute_constraints(ASMOperand *operands,
             op->reg = reg;
             regs_allocated[reg] |= reg_mask;
             break;
+	case 'e':
         case 'i':
             if (!((op->vt->r & (VT_VALMASK | VT_LVAL)) == VT_CONST))
                 goto try_next;
@@ -978,7 +1412,7 @@ static void asm_compute_constraints(ASMOperand *operands,
             }
             break;
         default:
-            error("asm constraint %d ('%s') could not be satisfied", 
+            tcc_error("asm constraint %d ('%s') could not be satisfied",
                   j, op->constraint);
             break;
         }
@@ -988,34 +1422,34 @@ static void asm_compute_constraints(ASMOperand *operands,
             operands[op->input_index].is_llong = op->is_llong;
         }
     }
-    
+
     /* compute out_reg. It is used to store outputs registers to memory
        locations references by pointers (VT_LLOCAL case) */
     *pout_reg = -1;
     for(i=0;i<nb_operands;i++) {
         op = &operands[i];
-        if (op->reg >= 0 && 
+        if (op->reg >= 0 &&
             (op->vt->r & VT_VALMASK) == VT_LLOCAL  &&
             !op->is_memory) {
             for(reg = 0; reg < 8; reg++) {
                 if (!(regs_allocated[reg] & REG_OUT_MASK))
                     goto reg_found2;
             }
-            error("could not find free output register for reloading");
+            tcc_error("could not find free output register for reloading");
         reg_found2:
             *pout_reg = reg;
             break;
         }
     }
-    
+
     /* print sorted constraints */
 #ifdef ASM_DEBUG
     for(i=0;i<nb_operands;i++) {
         j = sorted_op[i];
         op = &operands[j];
-        printf("%%%d [%s]: \"%s\" r=0x%04x reg=%d\n", 
-               j,                
-               op->id ? get_tok_str(op->id, NULL) : "", 
+        printf("%%%d [%s]: \"%s\" r=0x%04x reg=%d\n",
+               j,
+               op->id ? get_tok_str(op->id, NULL) : "",
                op->constraint,
                op->vt->r,
                op->reg);
@@ -1025,7 +1459,7 @@ static void asm_compute_constraints(ASMOperand *operands,
 #endif
 }
 
-static void subst_asm_operand(CString *add_str, 
+ST_FUNC void subst_asm_operand(CString *add_str,
                               SValue *sv, int modifier)
 {
     int r, reg, size, val;
@@ -1033,42 +1467,70 @@ static void subst_asm_operand(CString *add_str,
 
     r = sv->r;
     if ((r & VT_VALMASK) == VT_CONST) {
-        if (!(r & VT_LVAL) && modifier != 'c' && modifier != 'n')
+        if (!(r & VT_LVAL) && modifier != 'c' && modifier != 'n' &&
+	    modifier != 'P')
             cstr_ccat(add_str, '$');
         if (r & VT_SYM) {
-            cstr_cat(add_str, get_tok_str(sv->sym->v, NULL));
-            if (sv->c.i != 0) {
-                cstr_ccat(add_str, '+');
-            } else {
-                return;
-            }
+	    const char *name = get_tok_str(sv->sym->v, NULL);
+	    if (sv->sym->v >= SYM_FIRST_ANOM) {
+		/* In case of anonymous symbols ("L.42", used
+		   for static data labels) we can't find them
+		   in the C symbol table when later looking up
+		   this name.  So enter them now into the asm label
+		   list when we still know the symbol.  */
+		get_asm_sym(tok_alloc(name, strlen(name))->tok, sv->sym);
+	    }
+            cstr_cat(add_str, name, -1);
+            if ((uint32_t)sv->c.i == 0)
+                goto no_offset;
+	    cstr_ccat(add_str, '+');
         }
         val = sv->c.i;
         if (modifier == 'n')
             val = -val;
-        snprintf(buf, sizeof(buf), "%d", sv->c.i);
-        cstr_cat(add_str, buf);
+        snprintf(buf, sizeof(buf), "%d", (int)sv->c.i);
+        cstr_cat(add_str, buf, -1);
+    no_offset:;
+#ifdef TCC_TARGET_X86_64
+        if (r & VT_LVAL)
+            cstr_cat(add_str, "(%rip)", -1);
+#endif
     } else if ((r & VT_VALMASK) == VT_LOCAL) {
-        snprintf(buf, sizeof(buf), "%d(%%ebp)", sv->c.i);
-        cstr_cat(add_str, buf);
+#ifdef TCC_TARGET_X86_64
+        snprintf(buf, sizeof(buf), "%d(%%rbp)", (int)sv->c.i);
+#else
+        snprintf(buf, sizeof(buf), "%d(%%ebp)", (int)sv->c.i);
+#endif
+        cstr_cat(add_str, buf, -1);
     } else if (r & VT_LVAL) {
         reg = r & VT_VALMASK;
         if (reg >= VT_CONST)
-            error("internal compiler error");
-        snprintf(buf, sizeof(buf), "(%%%s)", 
-                 get_tok_str(TOK_ASM_eax + reg, NULL));
-        cstr_cat(add_str, buf);
+            tcc_error("internal compiler error");
+        snprintf(buf, sizeof(buf), "(%%%s)",
+#ifdef TCC_TARGET_X86_64
+                 get_tok_str(TOK_ASM_rax + reg, NULL)
+#else
+                 get_tok_str(TOK_ASM_eax + reg, NULL)
+#endif
+		 );
+        cstr_cat(add_str, buf, -1);
     } else {
         /* register case */
         reg = r & VT_VALMASK;
         if (reg >= VT_CONST)
-            error("internal compiler error");
+            tcc_error("internal compiler error");
 
         /* choose register operand size */
-        if ((sv->type.t & VT_BTYPE) == VT_BYTE)
+        if ((sv->type.t & VT_BTYPE) == VT_BYTE ||
+	    (sv->type.t & VT_BTYPE) == VT_BOOL)
             size = 1;
         else if ((sv->type.t & VT_BTYPE) == VT_SHORT)
             size = 2;
+#ifdef TCC_TARGET_X86_64
+        else if ((sv->type.t & VT_BTYPE) == VT_LLONG ||
+		 (sv->type.t & VT_BTYPE) == VT_PTR)
+            size = 8;
+#endif
         else
             size = 4;
         if (size == 1 && reg >= 4)
@@ -1076,14 +1538,20 @@ static void subst_asm_operand(CString *add_str,
 
         if (modifier == 'b') {
             if (reg >= 4)
-                error("cannot use byte register");
+                tcc_error("cannot use byte register");
             size = 1;
         } else if (modifier == 'h') {
             if (reg >= 4)
-                error("cannot use byte register");
+                tcc_error("cannot use byte register");
             size = -1;
         } else if (modifier == 'w') {
             size = 2;
+        } else if (modifier == 'k') {
+            size = 4;
+#ifdef TCC_TARGET_X86_64
+        } else if (modifier == 'q') {
+            size = 8;
+#endif
         }
 
         switch(size) {
@@ -1099,14 +1567,19 @@ static void subst_asm_operand(CString *add_str,
         default:
             reg = TOK_ASM_eax + reg;
             break;
+#ifdef TCC_TARGET_X86_64
+        case 8:
+            reg = TOK_ASM_rax + reg;
+            break;
+#endif
         }
         snprintf(buf, sizeof(buf), "%%%s", get_tok_str(reg, NULL));
-        cstr_cat(add_str, buf);
+        cstr_cat(add_str, buf, -1);
     }
 }
 
 /* generate prolog and epilog code for asm statement */
-static void asm_gen_code(ASMOperand *operands, int nb_operands, 
+ST_FUNC void asm_gen_code(ASMOperand *operands, int nb_operands,
                          int nb_outputs, int is_output,
                          uint8_t *clobber_regs,
                          int out_reg)
@@ -1114,7 +1587,18 @@ static void asm_gen_code(ASMOperand *operands, int nb_operands,
     uint8_t regs_allocated[NB_ASM_REGS];
     ASMOperand *op;
     int i, reg;
-    static uint8_t reg_saved[NB_SAVED_REGS] = { 3, 6, 7 };
+
+    /* Strictly speaking %Xbp and %Xsp should be included in the
+       call-preserved registers, but currently it doesn't matter.  */
+#ifdef TCC_TARGET_X86_64
+#ifdef TCC_TARGET_PE
+    static uint8_t reg_saved[] = { 3, 6, 7, 12, 13, 14, 15 };
+#else
+    static uint8_t reg_saved[] = { 3, 12, 13, 14, 15 };
+#endif
+#else
+    static uint8_t reg_saved[] = { 3, 6, 7 };
+#endif
 
     /* mark all used registers */
     memcpy(regs_allocated, clobber_regs, sizeof(regs_allocated));
@@ -1125,10 +1609,13 @@ static void asm_gen_code(ASMOperand *operands, int nb_operands,
     }
     if (!is_output) {
         /* generate reg save code */
-        for(i = 0; i < NB_SAVED_REGS; i++) {
+        for(i = 0; i < sizeof(reg_saved)/sizeof(reg_saved[0]); i++) {
             reg = reg_saved[i];
-            if (regs_allocated[reg]) 
+            if (regs_allocated[reg]) {
+		if (reg >= 8)
+		  g(0x41), reg-=8;
                 g(0x50 + reg);
+            }
         }
 
         /* generate load code */
@@ -1141,7 +1628,8 @@ static void asm_gen_code(ASMOperand *operands, int nb_operands,
                        output cases) */
                     SValue sv;
                     sv = *op->vt;
-                    sv.r = (sv.r & ~VT_VALMASK) | VT_LOCAL;
+                    sv.r = (sv.r & ~VT_VALMASK) | VT_LOCAL | VT_LVAL;
+                    sv.type.t = VT_PTR;
                     load(op->reg, &sv);
                 } else if (i >= nb_outputs || op->is_rw) {
                     /* load value in register */
@@ -1149,8 +1637,8 @@ static void asm_gen_code(ASMOperand *operands, int nb_operands,
                     if (op->is_llong) {
                         SValue sv;
                         sv = *op->vt;
-                        sv.c.ul += 4;
-                        load(TREG_EDX, &sv);
+                        sv.c.i += 4;
+                        load(TREG_XDX, &sv);
                     }
                 }
             }
@@ -1165,8 +1653,10 @@ static void asm_gen_code(ASMOperand *operands, int nb_operands,
                         SValue sv;
                         sv = *op->vt;
                         sv.r = (sv.r & ~VT_VALMASK) | VT_LOCAL;
+			sv.type.t = VT_PTR;
                         load(out_reg, &sv);
 
+			sv = *op->vt;
                         sv.r = (sv.r & ~VT_VALMASK) | out_reg;
                         store(op->reg, &sv);
                     }
@@ -1175,28 +1665,35 @@ static void asm_gen_code(ASMOperand *operands, int nb_operands,
                     if (op->is_llong) {
                         SValue sv;
                         sv = *op->vt;
-                        sv.c.ul += 4;
-                        store(TREG_EDX, &sv);
+                        sv.c.i += 4;
+                        store(TREG_XDX, &sv);
                     }
                 }
             }
         }
         /* generate reg restore code */
-        for(i = NB_SAVED_REGS - 1; i >= 0; i--) {
+        for(i = sizeof(reg_saved)/sizeof(reg_saved[0]) - 1; i >= 0; i--) {
             reg = reg_saved[i];
-            if (regs_allocated[reg]) 
+            if (regs_allocated[reg]) {
+		if (reg >= 8)
+		  g(0x41), reg-=8;
                 g(0x58 + reg);
+            }
         }
     }
 }
 
-static void asm_clobber(uint8_t *clobber_regs, const char *str)
+ST_FUNC void asm_clobber(uint8_t *clobber_regs, const char *str)
 {
     int reg;
     TokenSym *ts;
+#ifdef TCC_TARGET_X86_64
+    unsigned int type;
+#endif
 
-    if (!strcmp(str, "memory") || 
-        !strcmp(str, "cc"))
+    if (!strcmp(str, "memory") ||
+        !strcmp(str, "cc") ||
+	!strcmp(str, "flags"))
         return;
     ts = tok_alloc(str, strlen(str));
     reg = ts->tok;
@@ -1204,8 +1701,14 @@ static void asm_clobber(uint8_t *clobber_regs, const char *str)
         reg -= TOK_ASM_eax;
     } else if (reg >= TOK_ASM_ax && reg <= TOK_ASM_di) {
         reg -= TOK_ASM_ax;
+#ifdef TCC_TARGET_X86_64
+    } else if (reg >= TOK_ASM_rax && reg <= TOK_ASM_rdi) {
+        reg -= TOK_ASM_rax;
+    } else if ((reg = asm_parse_numeric_reg(reg, &type)) >= 0) {
+	;
+#endif
     } else {
-        error("invalid clobber register '%s'", str);
+        tcc_error("invalid clobber register '%s'", str);
     }
     clobber_regs[reg] = 1;
 }