diff third_party/luajit/src/vm_x86.dasc @ 178:94705b5986b3

[ThirdParty] Added WRK and luajit for load testing.
author MrJuneJune <me@mrjunejune.com>
date Thu, 22 Jan 2026 20:10:30 -0800
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/third_party/luajit/src/vm_x86.dasc	Thu Jan 22 20:10:30 2026 -0800
@@ -0,0 +1,5840 @@
+|// Low-level VM code for x86 CPUs.
+|// Bytecode interpreter, fast functions and helper functions.
+|// Copyright (C) 2005-2023 Mike Pall. See Copyright Notice in luajit.h
+|
+|.if P64
+|.arch x64
+|.else
+|.arch x86
+|.endif
+|.section code_op, code_sub
+|
+|.actionlist build_actionlist
+|.globals GLOB_
+|.globalnames globnames
+|.externnames extnames
+|
+|//-----------------------------------------------------------------------
+|
+|.if P64
+|.define X64, 1
+|.if WIN
+|.define X64WIN, 1
+|.endif
+|.endif
+|
+|// Fixed register assignments for the interpreter.
+|// This is very fragile and has many dependencies. Caveat emptor.
+|.define BASE,		edx		// Not C callee-save, refetched anyway.
+|.if not X64
+|.define KBASE,		edi		// Must be C callee-save.
+|.define KBASEa,	KBASE
+|.define PC,		esi		// Must be C callee-save.
+|.define PCa,		PC
+|.define DISPATCH,	ebx		// Must be C callee-save.
+|.elif X64WIN
+|.define KBASE,		edi		// Must be C callee-save.
+|.define KBASEa,	rdi
+|.define PC,		esi		// Must be C callee-save.
+|.define PCa,		rsi
+|.define DISPATCH,	ebx		// Must be C callee-save.
+|.else
+|.define KBASE,		r15d		// Must be C callee-save.
+|.define KBASEa,	r15
+|.define PC,		ebx		// Must be C callee-save.
+|.define PCa,		rbx
+|.define DISPATCH,	r14d		// Must be C callee-save.
+|.endif
+|
+|.define RA,		ecx
+|.define RAH,		ch
+|.define RAL,		cl
+|.define RB,		ebp		// Must be ebp (C callee-save).
+|.define RC,		eax		// Must be eax.
+|.define RCW,		ax
+|.define RCH,		ah
+|.define RCL,		al
+|.define OP,		RB
+|.define RD,		RC
+|.define RDW,		RCW
+|.define RDL,		RCL
+|.if X64
+|.define RAa, rcx
+|.define RBa, rbp
+|.define RCa, rax
+|.define RDa, rax
+|.else
+|.define RAa, RA
+|.define RBa, RB
+|.define RCa, RC
+|.define RDa, RD
+|.endif
+|
+|.if not X64
+|.define FCARG1,	ecx		// x86 fastcall arguments.
+|.define FCARG2,	edx
+|.elif X64WIN
+|.define CARG1,		rcx		// x64/WIN64 C call arguments.
+|.define CARG2,		rdx
+|.define CARG3,		r8
+|.define CARG4,		r9
+|.define CARG1d,	ecx
+|.define CARG2d,	edx
+|.define CARG3d,	r8d
+|.define CARG4d,	r9d
+|.define FCARG1,	CARG1d		// Upwards compatible to x86 fastcall.
+|.define FCARG2,	CARG2d
+|.else
+|.define CARG1,		rdi		// x64/POSIX C call arguments.
+|.define CARG2,		rsi
+|.define CARG3,		rdx
+|.define CARG4,		rcx
+|.define CARG5,		r8
+|.define CARG6,		r9
+|.define CARG1d,	edi
+|.define CARG2d,	esi
+|.define CARG3d,	edx
+|.define CARG4d,	ecx
+|.define CARG5d,	r8d
+|.define CARG6d,	r9d
+|.define FCARG1,	CARG1d		// Simulate x86 fastcall.
+|.define FCARG2,	CARG2d
+|.endif
+|
+|// Type definitions. Some of these are only used for documentation.
+|.type L,		lua_State
+|.type GL,		global_State
+|.type TVALUE,		TValue
+|.type GCOBJ,		GCobj
+|.type STR,		GCstr
+|.type TAB,		GCtab
+|.type LFUNC,		GCfuncL
+|.type CFUNC,		GCfuncC
+|.type PROTO,		GCproto
+|.type UPVAL,		GCupval
+|.type NODE,		Node
+|.type NARGS,		int
+|.type TRACE,		GCtrace
+|.type SBUF,		SBuf
+|
+|// Stack layout while in interpreter. Must match with lj_frame.h.
+|//-----------------------------------------------------------------------
+|.if not X64		// x86 stack layout.
+|
+|.if WIN
+|
+|.define CFRAME_SPACE,	aword*9			// Delta for esp (see <--).
+|.macro saveregs_			// x86/WIN prologue tail. ebp is pushed by saveregs.
+|  push edi; push esi; push ebx		// Fills SAVE_R3, SAVE_R2, SAVE_R1.
+|  push extern lj_err_unwind_win		// Handler address lands in SEH_FUNC.
+|  fs; push dword [0]			// Previous fs:[0] lands in SEH_NEXT.
+|  fs; mov [0], esp			// fs:[0] = address of SEH_NEXT.
+|  sub esp, CFRAME_SPACE			// Reserve the slots below SEH_NEXT.
+|.endmacro
+|.macro restoreregs			// Undoes saveregs, the ebp push included.
+|  add esp, CFRAME_SPACE			// esp is back at SEH_NEXT.
+|  fs; pop dword [0]			// Put the saved value back into fs:[0].
+|  pop edi	// Short for esp += 4. Skips SEH_FUNC, edi is reloaded below.
+|  pop ebx; pop esi; pop edi; pop ebp	// Reverse order of the pushes.
+|.endmacro
+|
+|.else
+|
+|.define CFRAME_SPACE,	aword*7			// Delta for esp (see <--).
+|.macro saveregs_			// x86 (non-WIN) prologue tail. ebp is pushed by saveregs.
+|  push edi; push esi; push ebx		// Fills SAVE_R3, SAVE_R2, SAVE_R1.
+|  sub esp, CFRAME_SPACE			// Reserve SAVE_PC down to ARG1.
+|.endmacro
+|.macro restoreregs			// Undoes saveregs, the ebp push included.
+|  add esp, CFRAME_SPACE			// esp is back at SAVE_R1.
+|  pop ebx; pop esi; pop edi; pop ebp	// Reverse order of the pushes.
+|.endmacro
+|
+|.endif
+|
+|.macro saveregs			// Full x86 prologue used by the vm_* entry points.
+|  push ebp; saveregs_			// ebp goes first and ends up in SAVE_R4.
+|.endmacro
+|
+|.if WIN
+|.define SAVE_ERRF,	aword [esp+aword*19]	// vm_pcall/vm_cpcall only.
+|.define SAVE_NRES,	aword [esp+aword*18]
+|.define SAVE_CFRAME,	aword [esp+aword*17]
+|.define SAVE_L,	aword [esp+aword*16]
+|//----- 16 byte aligned, ^^^ arguments from C caller
+|.define SAVE_RET,	aword [esp+aword*15]	//<-- esp entering interpreter.
+|.define SAVE_R4,	aword [esp+aword*14]
+|.define SAVE_R3,	aword [esp+aword*13]
+|.define SAVE_R2,	aword [esp+aword*12]
+|//----- 16 byte aligned
+|.define SAVE_R1,	aword [esp+aword*11]
+|.define SEH_FUNC,	aword [esp+aword*10]
+|.define SEH_NEXT,	aword [esp+aword*9]	//<-- esp after register saves.
+|.define UNUSED2,	aword [esp+aword*8]
+|//----- 16 byte aligned
+|.define UNUSED1,	aword [esp+aword*7]
+|.define SAVE_PC,	aword [esp+aword*6]
+|.define TMP2,		aword [esp+aword*5]
+|.define TMP1,		aword [esp+aword*4]
+|//----- 16 byte aligned
+|.define ARG4,		aword [esp+aword*3]
+|.define ARG3,		aword [esp+aword*2]
+|.define ARG2,		aword [esp+aword*1]
+|.define ARG1,		aword [esp]		//<-- esp while in interpreter.
+|//----- 16 byte aligned, ^^^ arguments for C callee
+|.else
+|.define SAVE_ERRF,	aword [esp+aword*15]	// vm_pcall/vm_cpcall only.
+|.define SAVE_NRES,	aword [esp+aword*14]
+|.define SAVE_CFRAME,	aword [esp+aword*13]
+|.define SAVE_L,	aword [esp+aword*12]
+|//----- 16 byte aligned, ^^^ arguments from C caller
+|.define SAVE_RET,	aword [esp+aword*11]	//<-- esp entering interpreter.
+|.define SAVE_R4,	aword [esp+aword*10]
+|.define SAVE_R3,	aword [esp+aword*9]
+|.define SAVE_R2,	aword [esp+aword*8]
+|//----- 16 byte aligned
+|.define SAVE_R1,	aword [esp+aword*7]	//<-- esp after register saves.
+|.define SAVE_PC,	aword [esp+aword*6]
+|.define TMP2,		aword [esp+aword*5]
+|.define TMP1,		aword [esp+aword*4]
+|//----- 16 byte aligned
+|.define ARG4,		aword [esp+aword*3]
+|.define ARG3,		aword [esp+aword*2]
+|.define ARG2,		aword [esp+aword*1]
+|.define ARG1,		aword [esp]		//<-- esp while in interpreter.
+|//----- 16 byte aligned, ^^^ arguments for C callee
+|.endif
+|
+|// FPARGx overlaps ARGx and ARG(x+1) on x86.
+|.define FPARG3,	qword [esp+qword*1]
+|.define FPARG1,	qword [esp]
+|// TMPQ overlaps TMP1/TMP2. ARG5/MULTRES overlap TMP1/TMP2 (and TMPQ).
+|.define TMPQ,		qword [esp+aword*4]
+|.define TMP3,		ARG4
+|.define ARG5,		TMP1
+|.define TMPa,		TMP1
+|.define MULTRES,	TMP2
+|
+|// Arguments for vm_call and vm_pcall.
+|.define INARG_BASE,	SAVE_CFRAME		// Overwritten by SAVE_CFRAME!
+|
+|// Arguments for vm_cpcall.
+|.define INARG_CP_CALL,	SAVE_ERRF
+|.define INARG_CP_UD,	SAVE_NRES
+|.define INARG_CP_FUNC,	SAVE_CFRAME
+|
+|//-----------------------------------------------------------------------
+|.elif X64WIN		// x64/Windows stack layout
+|
+|.define CFRAME_SPACE,	aword*5			// Delta for rsp (see <--).
+|.macro saveregs_			// x64/WIN prologue tail. rbp is pushed by saveregs.
+|  push rdi; push rsi; push rbx		// Fills SAVE_R3, SAVE_R2, SAVE_R1.
+|  sub rsp, CFRAME_SPACE			// Reserve ARG5 and CSAVE_4..CSAVE_1.
+|.endmacro
+|.macro saveregs			// Full x64/WIN prologue.
+|  push rbp; saveregs_			// rbp goes first and ends up in SAVE_R4.
+|.endmacro
+|.macro restoreregs			// Undoes saveregs, the rbp push included.
+|  add rsp, CFRAME_SPACE			// rsp is back at SAVE_R1.
+|  pop rbx; pop rsi; pop rdi; pop rbp	// Reverse order of the pushes.
+|.endmacro
+|
+|.define SAVE_CFRAME,	aword [rsp+aword*13]
+|.define SAVE_PC,	dword [rsp+dword*25]
+|.define SAVE_L,	dword [rsp+dword*24]
+|.define SAVE_ERRF,	dword [rsp+dword*23]
+|.define SAVE_NRES,	dword [rsp+dword*22]
+|.define TMP2,		dword [rsp+dword*21]
+|.define TMP1,		dword [rsp+dword*20]
+|//----- 16 byte aligned, ^^^ 32 byte register save area, owned by interpreter
+|.define SAVE_RET,	aword [rsp+aword*9]	//<-- rsp entering interpreter.
+|.define SAVE_R4,	aword [rsp+aword*8]
+|.define SAVE_R3,	aword [rsp+aword*7]
+|.define SAVE_R2,	aword [rsp+aword*6]
+|.define SAVE_R1,	aword [rsp+aword*5]	//<-- rsp after register saves.
+|.define ARG5,		aword [rsp+aword*4]
+|.define CSAVE_4,	aword [rsp+aword*3]
+|.define CSAVE_3,	aword [rsp+aword*2]
+|.define CSAVE_2,	aword [rsp+aword*1]
+|.define CSAVE_1,	aword [rsp]		//<-- rsp while in interpreter.
+|//----- 16 byte aligned, ^^^ 32 byte register save area, owned by callee
+|
+|// TMPQ overlaps TMP1/TMP2. MULTRES overlaps TMP2 (and TMPQ).
+|.define TMPQ,		qword [rsp+aword*10]
+|.define MULTRES,	TMP2
+|.define TMPa,		ARG5
+|.define ARG5d,		dword [rsp+aword*4]
+|.define TMP3,		ARG5d
+|
+|//-----------------------------------------------------------------------
+|.else			// x64/POSIX stack layout
+|
+|.define CFRAME_SPACE,	aword*5			// Delta for rsp (see <--).
+|.macro saveregs_			// x64/POSIX prologue tail. rbp is pushed by saveregs.
+|  push rbx; push r15; push r14		// Fills SAVE_R3, SAVE_R2, SAVE_R1.
+|.if NO_UNWIND
+|  push r13; push r12			// Fills SAVE_RU2, SAVE_RU1.
+|.endif
+|  sub rsp, CFRAME_SPACE			// Reserve SAVE_CFRAME down to TMP1.
+|.endmacro
+|.macro saveregs			// Full x64/POSIX prologue.
+|  push rbp; saveregs_			// rbp goes first and ends up in SAVE_R4.
+|.endmacro
+|.macro restoreregs			// Undoes saveregs, the rbp push included.
+|  add rsp, CFRAME_SPACE			// rsp is back at the last pushed register.
+|.if NO_UNWIND
+|  pop r12; pop r13			// Reverse order of the extra pushes.
+|.endif
+|  pop r14; pop r15; pop rbx; pop rbp	// Reverse order of the pushes.
+|.endmacro
+|
+|//----- 16 byte aligned,
+|.if NO_UNWIND
+|.define SAVE_RET,	aword [rsp+aword*11]	//<-- rsp entering interpreter.
+|.define SAVE_R4,	aword [rsp+aword*10]
+|.define SAVE_R3,	aword [rsp+aword*9]
+|.define SAVE_R2,	aword [rsp+aword*8]
+|.define SAVE_R1,	aword [rsp+aword*7]
+|.define SAVE_RU2,	aword [rsp+aword*6]
+|.define SAVE_RU1,	aword [rsp+aword*5]	//<-- rsp after register saves.
+|.else
+|.define SAVE_RET,	aword [rsp+aword*9]	//<-- rsp entering interpreter.
+|.define SAVE_R4,	aword [rsp+aword*8]
+|.define SAVE_R3,	aword [rsp+aword*7]
+|.define SAVE_R2,	aword [rsp+aword*6]
+|.define SAVE_R1,	aword [rsp+aword*5]	//<-- rsp after register saves.
+|.endif
+|.define SAVE_CFRAME,	aword [rsp+aword*4]
+|.define SAVE_PC,	dword [rsp+dword*7]
+|.define SAVE_L,	dword [rsp+dword*6]
+|.define SAVE_ERRF,	dword [rsp+dword*5]
+|.define SAVE_NRES,	dword [rsp+dword*4]
+|.define TMPa,		aword [rsp+aword*1]
+|.define TMP2,		dword [rsp+dword*1]
+|.define TMP1,		dword [rsp]		//<-- rsp while in interpreter.
+|//----- 16 byte aligned
+|
+|// TMPQ overlaps TMP1/TMP2. MULTRES overlaps TMP2 (and TMPQ).
+|.define TMPQ,		qword [rsp]
+|.define TMP3,		dword [rsp+aword*1]
+|.define MULTRES,	TMP2
+|
+|.endif
+|
+|//-----------------------------------------------------------------------
+|
+|// Instruction headers.
+|.macro ins_A; .endmacro			// Empty: ins_NEXT already left operand A in RA.
+|.macro ins_AD; .endmacro		// Empty: RD is the 16 bit value ins_NEXT left in RC.
+|.macro ins_AJ; .endmacro		// Empty: same register state as ins_AD.
+|.macro ins_ABC; movzx RB, RCH; movzx RC, RCL; .endmacro	// RB = operand B (RCH), RC = operand C (RCL).
+|.macro ins_AB_; movzx RB, RCH; .endmacro	// RB = operand B. RC is left undecoded.
+|.macro ins_A_C; movzx RC, RCL; .endmacro	// RC = operand C. RB is left untouched.
+|.macro ins_AND; not RDa; .endmacro	// Bitwise complement of RD in place.
+|
+|// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
+|.macro ins_NEXT				// Fetch, pre-decode and jump to the next opcode handler.
+|  mov RC, [PC]				// Load the 32 bit instruction word.
+|  movzx RA, RCH				// RA = byte 1 (the byte PC_RA names).
+|  movzx OP, RCL				// OP = byte 0 (the byte PC_OP names). OP aliases RB.
+|  add PC, 4				// PC now points past this instruction.
+|  shr RC, 16				// RC = RD = upper 16 bits (the word PC_RD names).
+|.if X64
+|  jmp aword [DISPATCH+OP*8]		// 8 byte dispatch table entries.
+|.else
+|  jmp aword [DISPATCH+OP*4]		// 4 byte dispatch table entries.
+|.endif
+|.endmacro
+|
+|// Instruction footer.
+|.if 1					// Hard-wired: the replicated variant is always selected.
+|  // Replicated dispatch. Fewer unpredictable branches, but higher I-Cache use.
+|  .define ins_next, ins_NEXT
+|  .define ins_next_, ins_NEXT
+|.else
+|  // Common dispatch. Lower I-Cache use, only one (very) unpredictable branch.
+|  // Affects only certain kinds of benchmarks (and only with -j off).
+|  // Around 10%-30% slower on Core2, a lot slower on P4.
+|  .macro ins_next
+|    jmp ->ins_next			// Jump to the single shared copy.
+|  .endmacro
+|  .macro ins_next_
+|  ->ins_next:				// The shared copy lives where ins_next_ is expanded.
+|    ins_NEXT
+|  .endmacro
+|.endif
+|
+|// Call decode and dispatch.
+|.macro ins_callt			// Dispatch the first instruction of the Lua function in RB.
+|  // BASE = new base, RB = LFUNC, RD = nargs+1, [BASE-4] = PC
+|  mov PC, LFUNC:RB->pc			// PC = func->pc.
+|  mov RA, [PC]				// Decode via RA so RD (nargs+1) stays intact.
+|  movzx OP, RAL				// OP = byte 0. OP aliases RB, so LFUNC is gone now.
+|  movzx RA, RAH				// RA = byte 1.
+|  add PC, 4				// PC points past the first instruction.
+|.if X64
+|  jmp aword [DISPATCH+OP*8]		// 8 byte dispatch table entries.
+|.else
+|  jmp aword [DISPATCH+OP*4]		// 4 byte dispatch table entries.
+|.endif
+|.endmacro
+|
+|.macro ins_call				// Like ins_callt, but stores the frame link first.
+|  // BASE = new base, RB = LFUNC, RD = nargs+1
+|  mov [BASE-4], PC			// Establishes the [BASE-4] = PC precondition of ins_callt.
+|  ins_callt
+|.endmacro
+|
+|//-----------------------------------------------------------------------
+|
+|// Macros to test operand types.
+|.macro checktp, reg, tp;  cmp dword [BASE+reg*8+4], tp; .endmacro	// Compare the dword at offset 4 of stack slot reg with tp. Only sets flags.
+|.macro checknum, reg, target; checktp reg, LJ_TISNUM; jae target; .endmacro	// Branch if that dword is >= LJ_TISNUM (unsigned).
+|.macro checkint, reg, target; checktp reg, LJ_TISNUM; jne target; .endmacro	// Branch if that dword is != LJ_TISNUM.
+|.macro checkstr, reg, target; checktp reg, LJ_TSTR; jne target; .endmacro	// Branch if that dword is != LJ_TSTR.
+|.macro checktab, reg, target; checktp reg, LJ_TTAB; jne target; .endmacro	// Branch if that dword is != LJ_TTAB.
+|
+|// These operands must be used with movzx.
+|.define PC_OP, byte [PC-4]
+|.define PC_RA, byte [PC-3]
+|.define PC_RB, byte [PC-1]
+|.define PC_RC, byte [PC-2]
+|.define PC_RD, word [PC-2]
+|
+|.macro branchPC, reg			// Apply a biased jump offset held in reg to PC.
+|  lea PC, [PC+reg*4-BCBIAS_J*4]		// PC += (reg - BCBIAS_J) * 4. Flags are preserved.
+|.endmacro
+|
+|// Assumes DISPATCH is relative to GL.
+#define DISPATCH_GL(field)	(GG_DISP2G + (int)offsetof(global_State, field))
+#define DISPATCH_J(field)	(GG_DISP2J + (int)offsetof(jit_State, field))
+|
+#define PC2PROTO(field)  ((int)offsetof(GCproto, field)-(int)sizeof(GCproto))
+|
+|// Decrement hashed hotcount and trigger trace recorder if zero.
+|.macro hotloop, reg			// Overwrites reg.
+|  mov reg, PC
+|  shr reg, 1
+|  and reg, HOTCOUNT_PCMASK		// reg = (PC >> 1) & mask = byte offset into the counters.
+|  sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_LOOP	// 16 bit counter at DISPATCH+GG_DISP2HOT.
+|  jb ->vm_hotloop			// Taken on borrow, i.e. the counter was below HOTCOUNT_LOOP.
+|.endmacro
+|
+|.macro hotcall, reg			// Same as hotloop with HOTCOUNT_CALL. Overwrites reg.
+|  mov reg, PC
+|  shr reg, 1
+|  and reg, HOTCOUNT_PCMASK		// reg = (PC >> 1) & mask = byte offset into the counters.
+|  sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_CALL	// 16 bit counter at DISPATCH+GG_DISP2HOT.
+|  jb ->vm_hotcall			// Taken on borrow, i.e. the counter was below HOTCOUNT_CALL.
+|.endmacro
+|
+|// Set current VM state.
+|.macro set_vmstate, st			// st is pasted onto LJ_VMST_ by the .. operator.
+|  mov dword [DISPATCH+DISPATCH_GL(vmstate)], ~LJ_VMST_..st	// Stores the complemented constant. Needs a valid DISPATCH.
+|.endmacro
+|
+|// x87 compares.
+|.macro fcomparepp			// Compare and pop st0 >< st1.
+|  fucomip st1				// Unordered compare into EFLAGS, pops st0.
+|  fpop					// Pops the former st1 as well.
+|.endmacro
+|
+|.macro fpop1; fstp st1; .endmacro	// Drops st1 and keeps the st0 value on top.
+|
+|// Synthesize SSE FP constants.
+|.macro sseconst_abs, reg, tmp		// Synthesize abs mask.
+|.if X64
+|  mov64 tmp, U64x(7fffffff,ffffffff); movd reg, tmp	// All bits except bit 63, built in tmp.
+|.else
+|  pxor reg, reg; pcmpeqd reg, reg; psrlq reg, 1	// All ones, then clear bit 63 of each qword. tmp is unused.
+|.endif
+|.endmacro
+|
+|.macro sseconst_hi, reg, tmp, val	// Synthesize hi-32 bit const.
+|.if X64
+|  mov64 tmp, U64x(val,00000000); movd reg, tmp	// val is given as bare hex digits. Low 32 bits are zero.
+|.else
+|  mov tmp, 0x .. val; movd reg, tmp; pshufd reg, reg, 0x51	// The shuffle moves val from dword 0 to dword 1 and zeroes dwords 0, 2 and 3.
+|.endif
+|.endmacro
+|
+|.macro sseconst_sign, reg, tmp		// Synthesize sign mask.
+|  sseconst_hi reg, tmp, 80000000
+|.endmacro
+|.macro sseconst_1, reg, tmp		// Synthesize 1.0.
+|  sseconst_hi reg, tmp, 3ff00000
+|.endmacro
+|.macro sseconst_2p52, reg, tmp		// Synthesize 2^52.
+|  sseconst_hi reg, tmp, 43300000
+|.endmacro
+|.macro sseconst_tobit, reg, tmp	// Synthesize 2^52 + 2^51.
+|  sseconst_hi reg, tmp, 43380000
+|.endmacro
+|
+|// Move table write barrier back. Overwrites reg.
+|.macro barrierback, tab, reg		// Needs a valid DISPATCH.
+|  and byte tab->marked, (uint8_t)~LJ_GC_BLACK	// black2gray(tab)
+|  mov reg, [DISPATCH+DISPATCH_GL(gc.grayagain)]	// reg = old head of g->gc.grayagain.
+|  mov [DISPATCH+DISPATCH_GL(gc.grayagain)], tab	// tab becomes the new head.
+|  mov tab->gclist, reg			// Old head is chained behind tab.
+|.endmacro
+|
+|//-----------------------------------------------------------------------
+
+/* Generate subroutines used by opcodes and other parts of the VM. */
+/* The .code_sub section should be last to help static branch prediction. */
+static void build_subroutines(BuildCtx *ctx)
+{
+  |.code_sub
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Return handling ----------------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |->vm_returnp:			// Reached from vm_return when the frame type is not FRAME_C.
+  |  test PC, FRAME_P
+  |  jz ->cont_dispatch			// FRAME_P bit clear: handled by cont_dispatch.
+  |
+  |  // Return from pcall or xpcall fast func.
+  |  and PC, -8				// Clear the low three bits, leaving the frame delta.
+  |  sub BASE, PC			// Restore caller base.
+  |  lea RAa, [RA+PC-8]			// Rebase RA and prepend one result.
+  |  mov PC, [BASE-4]			// Fetch PC of previous frame.
+  |  // Prepending may overwrite the pcall frame, so do it at the end.
+  |  mov dword [BASE+RA+4], LJ_TTRUE	// Prepend true to results. Falls through to vm_returnc.
+  |
+  |->vm_returnc:			// Also the fall-through target of vm_returnp.
+  |  add RD, 1				// RD = nresults+1
+  |  jz ->vm_unwind_yield		// Only taken when RD was -1 on entry.
+  |  mov MULTRES, RD
+  |  test PC, FRAME_TYPE
+  |  jz ->BC_RET_Z			// Handle regular return to Lua. Otherwise falls through to vm_return.
+  |
+  |->vm_return:
+  |  // BASE = base, RA = resultofs, RD = nresults+1 (= MULTRES), PC = return
+  |  xor PC, FRAME_C			// Type bits are zero afterwards only for a FRAME_C frame.
+  |  test PC, FRAME_TYPE
+  |  jnz ->vm_returnp			// Any other frame type.
+  |
+  |  // Return to C.
+  |  set_vmstate C
+  |  and PC, -8
+  |  sub PC, BASE
+  |  neg PC				// Previous base = BASE - delta.
+  |
+  |  sub RD, 1				// RD = nresults.
+  |  jz >2				// No results to move.
+  |1:  // Move results down.
+  |.if X64
+  |  mov RBa, [BASE+RA]		// One 8 byte slot per iteration.
+  |  mov [BASE-8], RBa
+  |.else
+  |  mov RB, [BASE+RA]			// Same slot copy done as two dwords.
+  |  mov [BASE-8], RB
+  |  mov RB, [BASE+RA+4]
+  |  mov [BASE-4], RB
+  |.endif
+  |  add BASE, 8			// Source and destination advance together.
+  |  sub RD, 1
+  |  jnz <1
+  |2:
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, PC			// PC holds the previous base computed above.
+  |3:
+  |  mov RD, MULTRES
+  |  mov RA, SAVE_NRES			// RA = wanted nresults+1
+  |4:
+  |  cmp RA, RD
+  |  jne >6				// More/less results wanted?
+  |5:
+  |  sub BASE, 8
+  |  mov L:RB->top, BASE
+  |
+  |->vm_leave_cp:			// Expects RB = L.
+  |  mov RAa, SAVE_CFRAME		// Restore previous C frame.
+  |  mov L:RB->cframe, RAa
+  |  xor eax, eax			// Ok return status for vm_pcall.
+  |
+  |->vm_leave_unw:			// Expects the return status in eax already.
+  |  restoreregs
+  |  ret
+  |
+  |6:
+  |  jb >7				// Less results wanted? Flags still come from the cmp at 4:.
+  |  // More results wanted. Check stack size and fill up results with nil.
+  |  cmp BASE, L:RB->maxstack
+  |  ja >8
+  |  mov dword [BASE-4], LJ_TNIL
+  |  add BASE, 8
+  |  add RD, 1
+  |  jmp <4
+  |
+  |7:  // Less results wanted.
+  |  test RA, RA
+  |  jz <5				// But check for LUA_MULTRET+1.
+  |  sub RA, RD				// Negative result!
+  |  lea BASE, [BASE+RA*8]		// Correct top.
+  |  jmp <5
+  |
+  |8:  // Corner case: need to grow stack for filling up results.
+  |  // This can happen if:
+  |  // - A C function grows the stack (a lot).
+  |  // - The GC shrinks the stack in between.
+  |  // - A return back from a lua_call() with (high) nresults adjustment.
+  |  mov L:RB->top, BASE		// Save current top held in BASE (yes).
+  |  mov MULTRES, RD			// Need to fill only remainder with nil.
+  |  mov FCARG2, RA			// n = RA (wanted nresults+1).
+  |  mov FCARG1, L:RB
+  |  call extern lj_state_growstack@8	// (lua_State *L, int n)
+  |  mov BASE, L:RB->top		// Need the (realloced) L->top in BASE.
+  |  jmp <3				// RD and RA are reloaded from their stack slots at 3:.
+  |
+  |->vm_unwind_yield:			// Jumped to from vm_returnc.
+  |  mov al, LUA_YIELD			// Writes al only. On the vm_returnc path eax was just wrapped to 0.
+  |  jmp ->vm_unwind_c_eh		// Leave with eax as the return status.
+  |
+  |->vm_unwind_c@8:			// Unwind C stack, return from vm_pcall.
+  |  // (void *cframe, int errcode)
+  |.if X64
+  |  mov eax, CARG2d			// Error return status for vm_pcall.
+  |  mov rsp, CARG1			// Stack pointer = cframe, so the SAVE_* slots address that frame.
+  |.else
+  |  mov eax, FCARG2			// Error return status for vm_pcall.
+  |  mov esp, FCARG1			// Stack pointer = cframe, so the SAVE_* slots address that frame.
+  |.if WIN
+  |  lea FCARG1, SEH_NEXT		// Address of this frame's SEH_NEXT slot.
+  |  fs; mov [0], FCARG1		// fs:[0] points at it again.
+  |.endif
+  |.endif
+  |->vm_unwind_c_eh:			// Landing pad for external unwinder.
+  |  mov L:RB, SAVE_L
+  |  mov GL:RB, L:RB->glref
+  |  mov dword GL:RB->vmstate, ~LJ_VMST_C	// Same value as set_vmstate C, addressed via GL instead of DISPATCH.
+  |  jmp ->vm_leave_unw			// Skips vm_leave_cp: eax is kept, L->cframe is not written.
+  |
+  |->vm_unwind_rethrow:		// Body exists on x64/POSIX only. Elsewhere the label coincides with vm_unwind_ff.
+  |.if X64 and not X64WIN
+  |  mov FCARG1, SAVE_L			// L must be read before restoreregs releases the frame.
+  |  mov FCARG2, eax			// errcode = eax.
+  |  restoreregs			// Pop this C frame before the tail call.
+  |  jmp extern lj_err_throw@8		// (lua_State *L, int errcode)
+  |.endif
+  |
+  |->vm_unwind_ff@4:			// Unwind C stack, return from ff pcall.
+  |  // (void *cframe)
+  |.if X64
+  |  and CARG1, CFRAME_RAWMASK		// Mask the cframe value before using it as stack pointer.
+  |  mov rsp, CARG1
+  |.else
+  |  and FCARG1, CFRAME_RAWMASK		// Mask the cframe value before using it as stack pointer.
+  |  mov esp, FCARG1
+  |.if WIN
+  |  lea FCARG1, SEH_NEXT		// Address of this frame's SEH_NEXT slot.
+  |  fs; mov [0], FCARG1		// fs:[0] points at it again.
+  |.endif
+  |.endif
+  |->vm_unwind_ff_eh:			// Landing pad for external unwinder.
+  |  mov L:RB, SAVE_L
+  |  mov RAa, -8			// Results start at BASE+RA = BASE-8.
+  |  mov RD, 1+1			// Really 1+2 results, incr. later.
+  |  mov BASE, L:RB->base
+  |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
+  |  add DISPATCH, GG_G2DISP
+  |  mov PC, [BASE-4]			// Fetch PC of previous frame.
+  |  mov dword [BASE-4], LJ_TFALSE	// Prepend false to error message.
+  |  set_vmstate INTERP			// DISPATCH was rebuilt just above.
+  |  jmp ->vm_returnc			// Increments RD/MULTRES and returns.
+  |
+  |.if WIN and not X64
+  |->vm_rtlunwind@16:			// Thin layer around RtlUnwind.
+  |  // (void *cframe, void *excptrec, void *unwinder, int errcode)
+  |  mov [esp], FCARG1			// Return value for RtlUnwind. Overwrites our own return address slot.
+  |  push FCARG2			// Exception record for RtlUnwind.
+  |  push 0				// Ignored by RtlUnwind.
+  |  push dword [FCARG1+CFRAME_OFS_SEH]	// Last push, read from the cframe at CFRAME_OFS_SEH.
+  |  call extern RtlUnwind@16		// Violates ABI (clobbers too much).
+  |  mov FCARG1, eax			// NOTE(review): presumably the cframe stored at [esp] above - confirm.
+  |  mov FCARG2, [esp+4]		// errcode (for vm_unwind_c).
+  |  ret				// Jump to unwinder. The unwinder argument is what ret pops.
+  |.endif
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Grow stack for calls -----------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |->vm_growstack_c:			// Grow stack for C function.
+  |  mov FCARG2, LUA_MINSTACK		// n = LUA_MINSTACK.
+  |  jmp >2				// Skips the L->base/L->top stores. See the state expected at 2:.
+  |
+  |->vm_growstack_v:			// Grow stack for vararg Lua function.
+  |  sub RD, 8				// NOTE(review): RD looks like a top pointer here, not nargs+1 - confirm at the caller.
+  |  jmp >1
+  |
+  |->vm_growstack_f:			// Grow stack for fixarg Lua function.
+  |  // BASE = new base, RD = nargs+1, RB = L, PC = first PC
+  |  lea RD, [BASE+NARGS:RD*8-8]	// RD = BASE + nargs*8, stored as L->top below.
+  |1:
+  |  movzx RA, byte [PC-4+PC2PROTO(framesize)]	// RA = framesize byte of the prototype.
+  |  add PC, 4				// Must point after first instruction.
+  |  mov L:RB->base, BASE
+  |  mov L:RB->top, RD
+  |  mov SAVE_PC, PC
+  |  mov FCARG2, RA			// n = framesize.
+  |2:
+  |  // RB = L, L->base = new base, L->top = top
+  |  mov FCARG1, L:RB
+  |  call extern lj_state_growstack@8	// (lua_State *L, int n)
+  |  mov BASE, L:RB->base		// Reload from L after the call.
+  |  mov RD, L:RB->top
+  |  mov LFUNC:RB, [BASE-8]		// RB = function object below the base. L is no longer in RB.
+  |  sub RD, BASE
+  |  shr RD, 3
+  |  add NARGS:RD, 1			// RD = (top - base) / 8 + 1.
+  |  // BASE = new base, RB = LFUNC, RD = nargs+1
+  |  ins_callt				// Just retry the call.
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Entry points into the assembler VM ---------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |->vm_resume:				// Setup C frame and resume thread.
+  |  // (lua_State *L, TValue *base, int nres1 = 0, ptrdiff_t ef = 0)
+  |  saveregs
+  |.if X64
+  |  mov L:RB, CARG1d			// Caveat: CARG1d may be RA.
+  |  mov SAVE_L, CARG1d
+  |  mov RA, CARG2d			// RA = base argument.
+  |.else
+  |  mov L:RB, SAVE_L			// On x86 SAVE_L is the incoming argument slot itself.
+  |  mov RA, INARG_BASE			// Caveat: overlaps SAVE_CFRAME!
+  |.endif
+  |  mov PC, FRAME_CP
+  |  xor RD, RD				// RD = 0. Serves as the zero source for the stores below.
+  |  lea KBASEa, [esp+CFRAME_RESUME]	// Stack pointer plus CFRAME_RESUME, stored as L->cframe below.
+  |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
+  |  add DISPATCH, GG_G2DISP
+  |  mov SAVE_PC, RD			// Any value outside of bytecode is ok.
+  |  mov SAVE_CFRAME, RDa		// Zero. On x86 this overwrites INARG_BASE, already copied to RA.
+  |.if X64
+  |  mov SAVE_NRES, RD			// Zero.
+  |  mov SAVE_ERRF, RD			// Zero.
+  |.endif
+  |  mov L:RB->cframe, KBASEa
+  |  cmp byte L:RB->status, RDL		// L->status == 0?
+  |  je >2				// Initial resume (like a call).
+  |
+  |  // Resume after yield (like a return).
+  |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+  |  set_vmstate INTERP
+  |  mov byte L:RB->status, RDL		// Clear L->status (RDL is still zero).
+  |  mov BASE, L:RB->base
+  |  mov RD, L:RB->top
+  |  sub RD, RA
+  |  shr RD, 3
+  |  add RD, 1				// RD = nresults+1
+  |  sub RA, BASE			// RA = resultofs
+  |  mov PC, [BASE-4]
+  |  mov MULTRES, RD
+  |  test PC, FRAME_TYPE
+  |  jz ->BC_RET_Z			// Type bits zero: BC_RET_Z handles it.
+  |  jmp ->vm_return			// Any other frame type.
+  |
+  |->vm_pcall:				// Setup protected C frame and enter VM.
+  |  // (lua_State *L, TValue *base, int nres1, ptrdiff_t ef)
+  |  saveregs
+  |  mov PC, FRAME_CP
+  |.if X64
+  |  mov SAVE_ERRF, CARG4d		// x86 needs no store: SAVE_ERRF is the incoming argument slot.
+  |.endif
+  |  jmp >1
+  |
+  |->vm_call:				// Setup C frame and enter VM.
+  |  // (lua_State *L, TValue *base, int nres1)
+  |  saveregs
+  |  mov PC, FRAME_C
+  |
+  |1:  // Entry point for vm_pcall above (PC = ftype).
+  |.if X64
+  |  mov SAVE_NRES, CARG3d		// x86 needs no store: SAVE_NRES is the incoming argument slot.
+  |  mov L:RB, CARG1d			// Caveat: CARG1d may be RA.
+  |  mov SAVE_L, CARG1d
+  |  mov RA, CARG2d			// RA = base argument.
+  |.else
+  |  mov L:RB, SAVE_L
+  |  mov RA, INARG_BASE			// Caveat: overlaps SAVE_CFRAME!
+  |.endif
+  |
+  |  mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
+  |  mov KBASEa, L:RB->cframe		// Add our C frame to cframe chain.
+  |  mov SAVE_CFRAME, KBASEa		// On x86 this overwrites INARG_BASE, already copied to RA.
+  |  mov SAVE_PC, L:RB			// Any value outside of bytecode is ok.
+  |  add DISPATCH, GG_G2DISP
+  |.if X64
+  |  mov L:RB->cframe, rsp
+  |.else
+  |  mov L:RB->cframe, esp
+  |.endif
+  |
+  |2:  // Entry point for vm_resume/vm_cpcall (RA = base, RB = L, PC = ftype).
+  |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+  |  set_vmstate INTERP
+  |  mov BASE, L:RB->base		// BASE = old base (used in vmeta_call).
+  |  add PC, RA
+  |  sub PC, BASE			// PC = frame delta + frame type
+  |
+  |  mov RD, L:RB->top
+  |  sub RD, RA
+  |  shr NARGS:RD, 3
+  |  add NARGS:RD, 1			// RD = nargs+1
+  |
+  |->vm_call_dispatch:			// RA = new base, PC = frame link, RD = nargs+1.
+  |  mov LFUNC:RB, [RA-8]
+  |  cmp dword [RA-4], LJ_TFUNC
+  |  jne ->vmeta_call			// Ensure KBASE defined and != BASE.
+  |
+  |->vm_call_dispatch_f:		// As above, with RB already holding the function.
+  |  mov BASE, RA
+  |  ins_call				// Stores PC at [BASE-4] and dispatches. Does not return here.
+  |  // BASE = new base, RB = func, RD = nargs+1, PC = caller PC
+  |
+  |->vm_cpcall:				// Setup protected C frame, call C.
+  |  // (lua_State *L, lua_CFunction func, void *ud, lua_CPFunction cp)
+  |  saveregs
+  |.if X64
+  |  mov L:RB, CARG1d			// Caveat: CARG1d may be RA.
+  |  mov SAVE_L, CARG1d
+  |.else
+  |  mov L:RB, SAVE_L
+  |  // Caveat: INARG_CP_* and SAVE_CFRAME/SAVE_NRES/SAVE_ERRF overlap!
+  |  mov RC, INARG_CP_UD		// Get args before they are overwritten.
+  |  mov RA, INARG_CP_FUNC
+  |  mov BASE, INARG_CP_CALL		// BASE temporarily holds cp, the call target below.
+  |.endif
+  |  mov SAVE_PC, L:RB			// Any value outside of bytecode is ok.
+  |
+  |  mov KBASE, L:RB->stack		// Compute -savestack(L, L->top).
+  |  sub KBASE, L:RB->top
+  |   mov DISPATCH, L:RB->glref		// Setup pointer to dispatch table.
+  |  mov SAVE_ERRF, 0			// No error function.
+  |  mov SAVE_NRES, KBASE		// Neg. delta means cframe w/o frame.
+  |   add DISPATCH, GG_G2DISP
+  |  // Handler may change cframe_nres(L->cframe) or cframe_errfunc(L->cframe).
+  |
+  |.if X64
+  |  mov KBASEa, L:RB->cframe		// Add our C frame to cframe chain.
+  |  mov SAVE_CFRAME, KBASEa
+  |  mov L:RB->cframe, rsp
+  |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+  |
+  |  call CARG4			// (lua_State *L, lua_CFunction func, void *ud)
+  |.else
+  |  mov ARG3, RC			// Have to copy args downwards.
+  |  mov ARG2, RA
+  |  mov ARG1, L:RB
+  |
+  |  mov KBASE, L:RB->cframe		// Add our C frame to cframe chain.
+  |  mov SAVE_CFRAME, KBASE
+  |  mov L:RB->cframe, esp
+  |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+  |
+  |  call BASE			// (lua_State *L, lua_CFunction func, void *ud)
+  |.endif
+  |  // TValue * (new base) or NULL returned in eax (RC).
+  |  test RC, RC
+  |  jz ->vm_leave_cp			// No base? Just remove C frame.
+  |  mov RA, RC				// RA = new base, as expected at 2:.
+  |  mov PC, FRAME_CP
+  |  jmp <2				// Else continue with the call. 2: is the shared entry inside vm_call.
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Metamethod handling ------------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |//-- Continuation dispatch ----------------------------------------------
+  |
+  |->cont_dispatch:
+  |  // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES)
+  |  add RA, BASE			// RA = address of the first result.
+  |  and PC, -8				// Clear the low three bits, leaving the frame delta.
+  |  mov RB, BASE			// Keep the meta base in RB.
+  |  sub BASE, PC			// Restore caller BASE.
+  |  mov dword [RA+RD*8-4], LJ_TNIL	// Ensure one valid arg.
+  |  mov RC, RA				// ... in [RC]
+  |  mov PC, [RB-12]			// Restore PC from [cont|PC].
+  |.if X64
+  |  movsxd RAa, dword [RB-16]		// May be negative on WIN64 with debug.
+  |.if FFI
+  |  cmp RA, 1
+  |  jbe >1				// cont is 0 or 1: special cases at 1:.
+  |.endif
+  |  lea KBASEa, qword [=>0]		// The stored cont is a 32 bit offset relative to =>0.
+  |  add RAa, KBASEa			// RAa = absolute continuation address.
+  |.else
+  |  mov RA, dword [RB-16]		// On x86 the stored value is jumped to as is.
+  |.if FFI
+  |  cmp RA, 1
+  |  jbe >1				// cont is 0 or 1: special cases at 1:.
+  |.endif
+  |.endif
+  |  mov LFUNC:KBASE, [BASE-8]		// Function object below the restored BASE.
+  |  mov KBASE, LFUNC:KBASE->pc
+  |  mov KBASE, [KBASE+PC2PROTO(k)]	// KBASE = k field of the prototype in front of the bytecode.
+  |  // BASE = base, RC = result, RB = meta base
+  |  jmp RAa				// Jump to continuation.
+  |
+  |.if FFI
+  |1:
+  |  je ->cont_ffi_callback		// cont = 1: return from FFI callback.
+  |  // cont = 0: Tail call from C function.
+  |  sub RB, BASE
+  |  shr RB, 3
+  |  lea RD, [RB-1]			// RD = (meta base - base) / 8 - 1.
+  |  jmp ->vm_call_tail
+  |.endif
+  |
+  |->cont_cat:				// BASE = base, RC = result, RB = mbase
+  |  movzx RA, PC_RB			// Operand B of the instruction just before PC.
+  |  sub RB, 16				// RB = mbase - 16.
+  |  lea RA, [BASE+RA*8]		// RA = address of slot B.
+  |  sub RA, RB
+  |  je ->cont_ra			// mbase-16 is slot B itself: just store the result.
+  |  neg RA
+  |  shr RA, 3				// RA = number of slots between slot B and mbase-16.
+  |.if X64WIN
+  |  mov CARG3d, RA
+  |  mov L:CARG1d, SAVE_L
+  |  mov L:CARG1d->base, BASE
+  |  mov RCa, [RC]			// Copy the result TValue to [RB].
+  |  mov [RB], RCa
+  |  mov CARG2d, RB
+  |.elif X64
+  |  mov L:CARG1d, SAVE_L
+  |  mov L:CARG1d->base, BASE
+  |  mov CARG3d, RA			// Taken before RAa is reused as scratch below.
+  |  mov RAa, [RC]			// Copy the result TValue to [RB].
+  |  mov [RB], RAa
+  |  mov CARG2d, RB
+  |.else
+  |  mov ARG3, RA			// Taken before RA is reused as scratch below.
+  |  mov RA, [RC+4]			// Copy the result TValue to [RB] as two dwords.
+  |  mov RC, [RC]
+  |  mov [RB+4], RA
+  |  mov [RB], RC
+  |  mov ARG2, RB
+  |.endif
+  |  jmp ->BC_CAT_Z			// Continues there with the second and third argument set up.
+  |
+  |//-- Table indexing metamethods -----------------------------------------
+  |
+  |->vmeta_tgets:
+  |  mov TMP1, RC			// RC = GCstr *
+  |  mov TMP2, LJ_TSTR			// Tag word of the temporary key.
+  |  lea RCa, TMP1			// Store temp. TValue in TMP1/TMP2.
+  |  cmp PC_OP, BC_GGET
+  |  jne >1				// Not GGET: the table operand is reloaded from slot B at 1:.
+  |  lea RA, [DISPATCH+DISPATCH_GL(tmptv)]  // Store fn->l.env in g->tmptv.
+  |  mov [RA], TAB:RB			// RB = GCtab *
+  |  mov dword [RA+4], LJ_TTAB
+  |  mov RB, RA				// RB = &g->tmptv, passed on as the table TValue.
+  |  jmp >2				// Skip the reload of RB at 1:.
+  |
+  |->vmeta_tgetb:
+  |  movzx RC, PC_RC			// Key is the byte operand C itself.
+  |.if DUALNUM
+  |  mov TMP2, LJ_TISNUM		// Tag word.
+  |  mov TMP1, RC			// Key stored as integer.
+  |.else
+  |  cvtsi2sd xmm0, RC			// Key converted to a double.
+  |  movsd TMPQ, xmm0
+  |.endif
+  |  lea RCa, TMPQ			// Store temp. TValue in TMPQ.
+  |  jmp >1
+  |
+  |->vmeta_tgetv:
+  |  movzx RC, PC_RC			// Reload TValue *k from RC.
+  |  lea RC, [BASE+RC*8]
+  |1:
+  |  movzx RB, PC_RB			// Reload TValue *t from RB.
+  |  lea RB, [BASE+RB*8]
+  |2:
+  |.if X64
+  |  mov L:CARG1d, SAVE_L
+  |  mov L:CARG1d->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
+  |  mov CARG2d, RB
+  |  mov CARG3, RCa			// May be 64 bit ptr to stack.
+  |  mov L:RB, L:CARG1d			// Keep L in RB across the call.
+  |.else
+  |  mov ARG2, RB
+  |  mov L:RB, SAVE_L			// RB = L from here on.
+  |  mov ARG3, RC
+  |  mov ARG1, L:RB
+  |  mov L:RB->base, BASE
+  |.endif
+  |  mov SAVE_PC, PC
+  |  call extern lj_meta_tget		// (lua_State *L, TValue *o, TValue *k)
+  |  // TValue * (finished) or NULL (metamethod) returned in eax (RC).
+  |  mov BASE, L:RB->base		// Reload BASE from L after the call.
+  |  test RC, RC
+  |  jz >3
+  |->cont_ra:				// BASE = base, RC = result
+  |  movzx RA, PC_RA
+  |.if X64
+  |  mov RBa, [RC]			// Copy the result TValue into slot A.
+  |  mov [BASE+RA*8], RBa
+  |.else
+  |  mov RB, [RC+4]			// Upper dword first, since RC is overwritten next.
+  |  mov RC, [RC]
+  |  mov [BASE+RA*8+4], RB
+  |  mov [BASE+RA*8], RC
+  |.endif
+  |  ins_next
+  |
+  |3:  // Call __index metamethod.
+  |  // BASE = base, L->top = new base, stack = cont/func/t/k
+  |  mov RA, L:RB->top
+  |  mov [RA-12], PC			// [cont|PC]
+  |  lea PC, [RA+FRAME_CONT]
+  |  sub PC, BASE			// PC = (new base - base) + FRAME_CONT.
+  |  mov LFUNC:RB, [RA-8]		// Guaranteed to be a function here.
+  |  mov NARGS:RD, 2+1			// 2 args for func(t, k).
+  |  jmp ->vm_call_dispatch_f
+  |
+  |->vmeta_tgetr:			// RB = GCtab *, RC = integer key on entry.
+  |  mov FCARG1, TAB:RB
+  |  mov RB, BASE			// Save BASE.
+  |  mov FCARG2, RC			// Caveat: FCARG2 == BASE
+  |  call extern lj_tab_getinth@8	// (GCtab *t, int32_t key)
+  |  // cTValue * or NULL returned in eax (RC).
+  |  movzx RA, PC_RA
+  |  mov BASE, RB			// Restore BASE.
+  |  test RC, RC
+  |  jnz ->BC_TGETR_Z			// Non-NULL: continue there with RC = result pointer.
+  |  mov dword [BASE+RA*8+4], LJ_TNIL	// NULL: slot A becomes nil.
+  |  jmp ->BC_TGETR2_Z
+  |
+  |//-----------------------------------------------------------------------
+  |
+  |->vmeta_tsets:
+  |  mov TMP1, RC			// RC = GCstr *
+  |  mov TMP2, LJ_TSTR			// Tag word of the temporary key.
+  |  lea RCa, TMP1			// Store temp. TValue in TMP1/TMP2.
+  |  cmp PC_OP, BC_GSET
+  |  jne >1				// Not GSET: the table operand is reloaded from slot B at 1:.
+  |  lea RA, [DISPATCH+DISPATCH_GL(tmptv)]  // Store fn->l.env in g->tmptv.
+  |  mov [RA], TAB:RB			// RB = GCtab *
+  |  mov dword [RA+4], LJ_TTAB
+  |  mov RB, RA				// RB = &g->tmptv, passed on as the table TValue.
+  |  jmp >2				// Skip the reload of RB at 1:.
+  |
+  |->vmeta_tsetb:
+  |  movzx RC, PC_RC			// Key is the byte operand C itself.
+  |.if DUALNUM
+  |  mov TMP2, LJ_TISNUM		// Tag word.
+  |  mov TMP1, RC			// Key stored as integer.
+  |.else
+  |  cvtsi2sd xmm0, RC			// Key converted to a double.
+  |  movsd TMPQ, xmm0
+  |.endif
+  |  lea RCa, TMPQ			// Store temp. TValue in TMPQ.
+  |  jmp >1
+  |
+  |->vmeta_tsetv:
+  |  movzx RC, PC_RC			// Reload TValue *k from RC.
+  |  lea RC, [BASE+RC*8]
+  |1:
+  |  movzx RB, PC_RB			// Reload TValue *t from RB.
+  |  lea RB, [BASE+RB*8]
+  |2:
+  |.if X64
+  |  mov L:CARG1d, SAVE_L
+  |  mov L:CARG1d->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
+  |  mov CARG2d, RB
+  |  mov CARG3, RCa			// May be 64 bit ptr to stack.
+  |  mov L:RB, L:CARG1d			// Keep L in RB across the call.
+  |.else
+  |  mov ARG2, RB
+  |  mov L:RB, SAVE_L			// RB = L from here on.
+  |  mov ARG3, RC
+  |  mov ARG1, L:RB
+  |  mov L:RB->base, BASE
+  |.endif
+  |  mov SAVE_PC, PC
+  |  call extern lj_meta_tset		// (lua_State *L, TValue *o, TValue *k)
+  |  // TValue * (finished) or NULL (metamethod) returned in eax (RC).
+  |  mov BASE, L:RB->base		// Reload BASE from L after the call.
+  |  test RC, RC
+  |  jz >3
+  |  // NOBARRIER: lj_meta_tset ensures the table is not black.
+  |  movzx RA, PC_RA
+  |.if X64
+  |  mov RBa, [BASE+RA*8]		// Copy slot A to the returned TValue.
+  |  mov [RC], RBa
+  |.else
+  |  mov RB, [BASE+RA*8+4]		// Upper dword first, since RA is overwritten next.
+  |  mov RA, [BASE+RA*8]
+  |  mov [RC+4], RB
+  |  mov [RC], RA
+  |.endif
+  |->cont_nop:				// BASE = base, (RC = result)
+  |  ins_next
+  |
+  |3:  // Call __newindex metamethod.
+  |  // BASE = base, L->top = new base, stack = cont/func/t/k/(v)
+  |  mov RA, L:RB->top
+  |  mov [RA-12], PC			// [cont|PC]
+  |  movzx RC, PC_RA
+  |  // Copy value to third argument.
+  |.if X64
+  |  mov RBa, [BASE+RC*8]
+  |  mov [RA+16], RBa
+  |.else
+  |  mov RB, [BASE+RC*8+4]		// Upper dword first, since RC is overwritten next.
+  |  mov RC, [BASE+RC*8]
+  |  mov [RA+20], RB
+  |  mov [RA+16], RC
+  |.endif
+  |  lea PC, [RA+FRAME_CONT]
+  |  sub PC, BASE			// PC = (new base - base) + FRAME_CONT.
+  |  mov LFUNC:RB, [RA-8]		// Guaranteed to be a function here.
+  |  mov NARGS:RD, 3+1			// 3 args for func(t, k, v).
+  |  jmp ->vm_call_dispatch_f
+  |
+  |->vmeta_tsetr:			// Slow path: get slot for integer key RC in TAB:RB via lj_tab_setinth.
+  |.if X64WIN
+  |  mov L:CARG1d, SAVE_L
+  |  mov CARG3d, RC
+  |  mov L:CARG1d->base, BASE
+  |  xchg CARG2d, TAB:RB		// Caveat: CARG2d == BASE. Swap also leaves BASE saved in RB.
+  |.elif X64
+  |  mov L:CARG1d, SAVE_L
+  |  mov CARG2d, TAB:RB
+  |  mov L:CARG1d->base, BASE
+  |  mov RB, BASE			// Save BASE.
+  |  mov CARG3d, RC			// Caveat: CARG3d == BASE.
+  |.else
+  |  mov L:RA, SAVE_L
+  |  mov ARG2, TAB:RB
+  |  mov RB, BASE			// Save BASE.
+  |  mov ARG3, RC
+  |  mov ARG1, L:RA
+  |  mov L:RA->base, BASE
+  |.endif
+  |  mov SAVE_PC, PC
+  |  call extern lj_tab_setinth  // (lua_State *L, GCtab *t, int32_t key)
+  |  // TValue * returned in eax (RC).
+  |  movzx RA, PC_RA			// Reload RA from PC_RA for the continuation.
+  |  mov BASE, RB			// Restore BASE.
+  |  jmp ->BC_TSETR_Z
+  |
+  |//-- Comparison metamethods ---------------------------------------------
+  |
+  |->vmeta_comp:			// Comparison fallback: lj_meta_comp on slots RA and RD, op = PC_OP.
+  |.if X64
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE		// Caveat: CARG2d/CARG3d == BASE.
+  |.if X64WIN
+  |  lea CARG3d, [BASE+RD*8]
+  |  lea CARG2d, [BASE+RA*8]
+  |.else
+  |  lea CARG2d, [BASE+RA*8]
+  |  lea CARG3d, [BASE+RD*8]
+  |.endif
+  |  mov CARG1d, L:RB			// Caveat: CARG1d/CARG4d == RA.
+  |  movzx CARG4d, PC_OP
+  |.else
+  |  movzx RB, PC_OP
+  |  lea RD, [BASE+RD*8]
+  |  lea RA, [BASE+RA*8]
+  |  mov ARG4, RB
+  |  mov L:RB, SAVE_L
+  |  mov ARG3, RD
+  |  mov ARG2, RA
+  |  mov ARG1, L:RB
+  |  mov L:RB->base, BASE
+  |.endif
+  |  mov SAVE_PC, PC
+  |  call extern lj_meta_comp	// (lua_State *L, TValue *o1, *o2, int op)
+  |  // 0/1 or TValue * (metamethod) returned in eax (RC).
+  |3:					// Shared tail for vmeta_equal and vmeta_equal_cd, too.
+  |  mov BASE, L:RB->base
+  |  cmp RC, 1
+  |  ja ->vmeta_binop			// RC > 1 (unsigned): TValue *, call the metamethod.
+  |4:
+  |  lea PC, [PC+4]			// lea leaves the flags of the preceding cmp untouched.
+  |  jb >6				// Below: skip the branch at label 5.
+  |5:
+  |  movzx RD, PC_RD
+  |  branchPC RD
+  |6:
+  |  ins_next
+  |
+  |->cont_condt:			// BASE = base, RC = result
+  |  add PC, 4
+  |  cmp dword [RC+4], LJ_TISTRUECOND	// Branch if result is true.
+  |  jb <5
+  |  jmp <6
+  |
+  |->cont_condf:			// BASE = base, RC = result
+  |  cmp dword [RC+4], LJ_TISTRUECOND	// Branch if result is false.
+  |  jmp <4				// Label 4 advances PC and tests the flags set here.
+  |
+  |->vmeta_equal:			// Equality fallback: lj_meta_equal(L, RA, RD, RB).
+  |  sub PC, 4
+  |.if X64WIN
+  |  mov CARG3d, RD
+  |  mov CARG4d, RB
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE		// Caveat: CARG2d == BASE.
+  |  mov CARG2d, RA
+  |  mov CARG1d, L:RB			// Caveat: CARG1d == RA.
+  |.elif X64
+  |  mov CARG2d, RA
+  |  mov CARG4d, RB			// Caveat: CARG4d == RA.
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE		// Caveat: CARG3d == BASE.
+  |  mov CARG3d, RD
+  |  mov CARG1d, L:RB
+  |.else
+  |  mov ARG4, RB
+  |  mov L:RB, SAVE_L
+  |  mov ARG3, RD
+  |  mov ARG2, RA
+  |  mov ARG1, L:RB
+  |  mov L:RB->base, BASE
+  |.endif
+  |  mov SAVE_PC, PC
+  |  call extern lj_meta_equal	// (lua_State *L, GCobj *o1, *o2, int ne)
+  |  // 0/1 or TValue * (metamethod) returned in eax (RC).
+  |  jmp <3
+  |
+  |->vmeta_equal_cd:			// Label always exists; body is only emitted with FFI.
+  |.if FFI
+  |  sub PC, 4
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |  mov FCARG1, L:RB
+  |  mov FCARG2, dword [PC-4]		// Pass the instruction word before the adjusted PC.
+  |  mov SAVE_PC, PC
+  |  call extern lj_meta_equal_cd@8	// (lua_State *L, BCIns ins)
+  |  // 0/1 or TValue * (metamethod) returned in eax (RC).
+  |  jmp <3
+  |.endif
+  |
+  |->vmeta_istype:			// Type check fallback: lj_meta_istype(L, RA, PC_RD).
+  |.if X64
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
+  |  mov CARG2d, RA
+  |  movzx CARG3d, PC_RD
+  |  mov L:CARG1d, L:RB
+  |.else
+  |  movzx RD, PC_RD
+  |  mov ARG2, RA
+  |  mov L:RB, SAVE_L
+  |  mov ARG3, RD
+  |  mov ARG1, L:RB
+  |  mov L:RB->base, BASE
+  |.endif
+  |  mov SAVE_PC, PC
+  |  call extern lj_meta_istype  // (lua_State *L, BCReg ra, BCReg tp)
+  |  mov BASE, L:RB->base
+  |  jmp <6				// Return value is not inspected; go to the next instruction.
+  |
+  |//-- Arithmetic metamethods ---------------------------------------------
+  |
+  |->vmeta_arith_vno:			// Arithmetic fallbacks: compute operand pointers, then call lj_meta_arith.
+  |.if DUALNUM
+  |  movzx RB, PC_RB
+  |.endif
+  |->vmeta_arith_vn:			// RB = slot index, RC = index relative to KBASE.
+  |  lea RC, [KBASE+RC*8]
+  |  jmp >1
+  |
+  |->vmeta_arith_nvo:
+  |.if DUALNUM
+  |  movzx RC, PC_RC
+  |.endif
+  |->vmeta_arith_nv:			// Like vmeta_arith_vn, but with the operands swapped.
+  |  lea RC, [KBASE+RC*8]
+  |  lea RB, [BASE+RB*8]
+  |  xchg RB, RC				// RB = KBASE-relative operand, RC = stack operand.
+  |  jmp >2
+  |
+  |->vmeta_unm:				// Unary op: the slot RD is passed as both operands.
+  |  lea RC, [BASE+RD*8]
+  |  mov RB, RC
+  |  jmp >2
+  |
+  |->vmeta_arith_vvo:
+  |.if DUALNUM
+  |  movzx RB, PC_RB
+  |.endif
+  |->vmeta_arith_vv:			// Both operands are stack slots.
+  |  lea RC, [BASE+RC*8]
+  |1:
+  |  lea RB, [BASE+RB*8]
+  |2:					// RB = TValue *rb, RC = TValue *rc, RA = slot index.
+  |  lea RA, [BASE+RA*8]
+  |.if X64WIN
+  |  mov CARG3d, RB
+  |  mov CARG4d, RC
+  |  movzx RC, PC_OP
+  |  mov ARG5d, RC
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE		// Caveat: CARG2d == BASE.
+  |  mov CARG2d, RA
+  |  mov CARG1d, L:RB			// Caveat: CARG1d == RA.
+  |.elif X64
+  |  movzx CARG5d, PC_OP
+  |  mov CARG2d, RA
+  |  mov CARG4d, RC			// Caveat: CARG4d == RA.
+  |  mov L:CARG1d, SAVE_L
+  |  mov L:CARG1d->base, BASE		// Caveat: CARG3d == BASE.
+  |  mov CARG3d, RB
+  |  mov L:RB, L:CARG1d
+  |.else
+  |  mov ARG3, RB
+  |  mov L:RB, SAVE_L
+  |  mov ARG4, RC
+  |  movzx RC, PC_OP
+  |  mov ARG2, RA
+  |  mov ARG5, RC
+  |  mov ARG1, L:RB
+  |  mov L:RB->base, BASE
+  |.endif
+  |  mov SAVE_PC, PC
+  |  call extern lj_meta_arith	// (lua_State *L, TValue *ra,*rb,*rc, BCReg op)
+  |  // NULL (finished) or TValue * (metamethod) returned in eax (RC).
+  |  mov BASE, L:RB->base
+  |  test RC, RC
+  |  jz ->cont_nop			// NULL: done, go to the next instruction.
+  |
+  |  // Call metamethod for binary op.
+  |->vmeta_binop:
+  |  // BASE = base, RC = new base, stack = cont/func/o1/o2
+  |  mov RA, RC
+  |  sub RC, BASE				// RC = frame delta (new base - base).
+  |  mov [RA-12], PC			// [cont|PC]
+  |  lea PC, [RC+FRAME_CONT]
+  |  mov NARGS:RD, 2+1			// 2 args for func(o1, o2).
+  |  jmp ->vm_call_dispatch
+  |
+  |->vmeta_len:				// Length fallback: lj_meta_len on slot RD.
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |  lea FCARG2, [BASE+RD*8]		// Caveat: FCARG2 == BASE
+  |  mov L:FCARG1, L:RB
+  |  mov SAVE_PC, PC
+  |  call extern lj_meta_len@8		// (lua_State *L, TValue *o)
+  |  // NULL (retry) or TValue * (metamethod) returned in eax (RC).
+  |  mov BASE, L:RB->base
+#if LJ_52
+  |  test RC, RC
+  |  jne ->vmeta_binop			// Binop call for compatibility.
+  |  movzx RD, PC_RD			// NULL: reload the operand and retry at BC_LEN_Z.
+  |  mov TAB:FCARG1, [BASE+RD*8]
+  |  jmp ->BC_LEN_Z
+#else
+  |  jmp ->vmeta_binop			// Binop call for compatibility.
+#endif
+  |
+  |//-- Call metamethod ----------------------------------------------------
+  |
+  |->vmeta_call_ra:
+  |  lea RA, [BASE+RA*8+8]		// Convert slot index RA to the new base pointer.
+  |->vmeta_call:			// Resolve and call __call metamethod.
+  |  // BASE = old base, RA = new base, RC = nargs+1, PC = return
+  |  // Calls lj_meta_call(L, func, top), then redispatches the call with one
+  |  // more argument. KBASE == BASE on entry marks a tail call (CALLT).
+  |  mov TMP2, RA			// Save RA, RC for us.
+  |  mov TMP1, NARGS:RD
+  |  sub RA, 8				// RA = TValue *func (slot below the new base).
+  |.if X64
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
+  |  mov CARG2d, RA
+  |  lea CARG3d, [RA+NARGS:RD*8]
+  |  mov CARG1d, L:RB			// Caveat: CARG1d may be RA.
+  |.else
+  |  lea RC, [RA+NARGS:RD*8]
+  |  mov L:RB, SAVE_L
+  |  mov ARG2, RA
+  |  mov ARG3, RC
+  |  mov ARG1, L:RB
+  |  mov L:RB->base, BASE		// This is the callers base!
+  |.endif
+  |  mov SAVE_PC, PC
+  |  call extern lj_meta_call	// (lua_State *L, TValue *func, TValue *top)
+  |  mov BASE, L:RB->base
+  |  mov RA, TMP2
+  |  mov NARGS:RD, TMP1
+  |  mov LFUNC:RB, [RA-8]
+  |  add NARGS:RD, 1			// One more argument after lj_meta_call.
+  |  // This is fragile. L->base must not move, KBASE must always be defined.
+  |  // The condition must name the upper-case X64 define: DynASM evaluates an
+  |  // undefined identifier as false, which silently drops the 64 bit compare.
+  |.if X64
+  |  cmp KBASEa, rdx			// Continue with CALLT if flag set.
+  |.else
+  |  cmp KBASE, BASE			// Continue with CALLT if flag set.
+  |.endif
+  |  je ->BC_CALLT_Z
+  |  mov BASE, RA
+  |  ins_call				// Otherwise call resolved metamethod.
+  |
+  |//-- Argument coercion for 'for' statement ------------------------------
+  |
+  |->vmeta_for:				// Call lj_meta_for on the loop slots at RA, then re-dispatch the instruction.
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |  mov FCARG2, RA			// Caveat: FCARG2 == BASE
+  |  mov L:FCARG1, L:RB			// Caveat: FCARG1 == RA
+  |  mov SAVE_PC, PC
+  |  call extern lj_meta_for@8	// (lua_State *L, TValue *base)
+  |  mov BASE, L:RB->base
+  |  mov RC, [PC-4]			// Refetch the instruction word before PC.
+  |  movzx RA, RCH			// RA = bits 8..15.
+  |  movzx OP, RCL			// OP = bits 0..7.
+  |  shr RC, 16				// RC = upper 16 bits.
+  |.if X64
+  |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]	// Retry FORI or JFORI.
+  |.else
+  |  jmp aword [DISPATCH+OP*4+GG_DISP2STATIC]	// Retry FORI or JFORI.
+  |.endif
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Fast functions -----------------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |.macro .ffunc, name			// Define the entry label ff_<name> without any argument check.
+  |->ff_ .. name:
+  |.endmacro
+  |
+  |.macro .ffunc_1, name		// Entry label plus check for at least 1 argument.
+  |->ff_ .. name:
+  |  cmp NARGS:RD, 1+1;  jb ->fff_fallback	// Flags of this cmp are still set after the macro.
+  |.endmacro
+  |
+  |.macro .ffunc_2, name		// Entry label plus check for at least 2 arguments.
+  |->ff_ .. name:
+  |  cmp NARGS:RD, 2+1;  jb ->fff_fallback
+  |.endmacro
+  |
+  |.macro .ffunc_nsse, name, op		// 1 number argument, applied with 'op' into xmm0.
+  |  .ffunc_1 name
+  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
+  |  op xmm0, qword [BASE]
+  |.endmacro
+  |
+  |.macro .ffunc_nsse, name		// 1 number argument, loaded into xmm0.
+  |  .ffunc_nsse name, movsd
+  |.endmacro
+  |
+  |.macro .ffunc_nnsse, name		// 2 number arguments, loaded into xmm0 and xmm1.
+  |  .ffunc_2 name
+  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
+  |  cmp dword [BASE+12], LJ_TISNUM;  jae ->fff_fallback
+  |  movsd xmm0, qword [BASE]
+  |  movsd xmm1, qword [BASE+8]
+  |.endmacro
+  |
+  |.macro .ffunc_nnr, name		// 2 number arguments, pushed on the x87 stack.
+  |  .ffunc_2 name
+  |  cmp dword [BASE+4], LJ_TISNUM;  jae ->fff_fallback
+  |  cmp dword [BASE+12], LJ_TISNUM;  jae ->fff_fallback
+  |  fld qword [BASE+8]			// 2nd argument is pushed first ...
+  |  fld qword [BASE]			// ... so the 1st argument ends up in st0.
+  |.endmacro
+  |
+  |// Inlined GC threshold check. Caveat: uses label 1. Also overwrites RB.
+  |.macro ffgccheck
+  |  mov RB, [DISPATCH+DISPATCH_GL(gc.total)]
+  |  cmp RB, [DISPATCH+DISPATCH_GL(gc.threshold)]
+  |  jb >1				// total < threshold (unsigned): no GC step.
+  |  call ->fff_gcstep
+  |1:
+  |.endmacro
+  |
+  |//-- Base library: checks -----------------------------------------------
+  |
+  |.ffunc_1 assert			// Returns all arguments if the 1st one is true, else falls back.
+  |  mov RB, [BASE+4]
+  |  cmp RB, LJ_TISTRUECOND;  jae ->fff_fallback
+  |  mov PC, [BASE-4]
+  |  mov MULTRES, RD			// Keep nargs+1 as the result count.
+  |  mov [BASE-4], RB			// Copy the 1st argument down to the slot at BASE-8.
+  |  mov RB, [BASE]
+  |  mov [BASE-8], RB
+  |  sub RD, 2				// RD = number of remaining arguments.
+  |  jz >2
+  |  mov RA, BASE
+  |1:					// Move each remaining argument down by one slot.
+  |  add RA, 8
+  |.if X64
+  |  mov RBa, [RA]
+  |  mov [RA-8], RBa
+  |.else
+  |  mov RB, [RA+4]
+  |  mov [RA-4], RB
+  |  mov RB, [RA]
+  |  mov [RA-8], RB
+  |.endif
+  |  sub RD, 1
+  |  jnz <1
+  |2:
+  |  mov RD, MULTRES
+  |  jmp ->fff_res_
+  |
+  |.ffunc_1 type				// Returns a string taken from the function's upvalue array, indexed by type.
+  |  mov RB, [BASE+4]
+  |.if X64
+  |  mov RA, RB
+  |  sar RA, 15
+  |  cmp RA, -2
+  |  je >3				// Tag matches this pattern: use the LJ_TLIGHTUD index (label 3).
+  |.endif
+  |  mov RC, ~LJ_TNUMX
+  |  not RB
+  |  cmp RC, RB
+  |  cmova RC, RB			// RC = min(~LJ_TNUMX, ~tag), unsigned.
+  |2:
+  |  mov CFUNC:RB, [BASE-8]
+  |  mov STR:RC, [CFUNC:RB+RC*8+((char *)(&((GCfuncC *)0)->upvalue))]
+  |  mov PC, [BASE-4]
+  |  mov dword [BASE-4], LJ_TSTR
+  |  mov [BASE-8], STR:RC
+  |  jmp ->fff_res1
+  |.if X64
+  |3:
+  |  mov RC, ~LJ_TLIGHTUD
+  |  jmp <2
+  |.endif
+  |
+  |//-- Base library: getters and setters ---------------------------------
+  |
+  |.ffunc_1 getmetatable		// Returns mt.__metatable if present and non-nil, else mt, else nil.
+  |  mov RB, [BASE+4]
+  |  mov PC, [BASE-4]
+  |  cmp RB, LJ_TTAB;  jne >6
+  |1:  // Field metatable must be at same offset for GCtab and GCudata!
+  |  mov TAB:RB, [BASE]
+  |  mov TAB:RB, TAB:RB->metatable
+  |2:
+  |  test TAB:RB, TAB:RB
+  |  mov dword [BASE-4], LJ_TNIL		// mov does not change the flags of the test above.
+  |  jz ->fff_res1			// No metatable: return nil.
+  |  mov STR:RC, [DISPATCH+DISPATCH_GL(gcroot)+4*(GCROOT_MMNAME+MM_metatable)]
+  |  mov dword [BASE-4], LJ_TTAB	// Store metatable as default result.
+  |  mov [BASE-8], TAB:RB
+  |  mov RA, TAB:RB->hmask		// Compute the start node: node + (hmask & sid) * #NODE.
+  |  and RA, STR:RC->sid
+  |  imul RA, #NODE
+  |  add NODE:RA, TAB:RB->node
+  |3:  // Rearranged logic, because we expect _not_ to find the key.
+  |  cmp dword NODE:RA->key.it, LJ_TSTR
+  |  jne >4
+  |  cmp dword NODE:RA->key.gcr, STR:RC
+  |  je >5
+  |4:
+  |  mov NODE:RA, NODE:RA->next		// Follow the node chain until NULL.
+  |  test NODE:RA, NODE:RA
+  |  jnz <3
+  |  jmp ->fff_res1			// Not found, keep default result.
+  |5:
+  |  mov RB, [RA+4]
+  |  cmp RB, LJ_TNIL;  je ->fff_res1	// Ditto for nil value.
+  |  mov RC, [RA]
+  |  mov [BASE-4], RB			// Return value of mt.__metatable.
+  |  mov [BASE-8], RC
+  |  jmp ->fff_res1
+  |
+  |6:					// Not a table: userdata or a type with a base metatable.
+  |  cmp RB, LJ_TUDATA;  je <1
+  |.if X64
+  |  cmp RB, LJ_TNUMX;  ja >8
+  |  cmp RB, LJ_TISNUM;  jbe >7
+  |  mov RB, LJ_TLIGHTUD
+  |  jmp >8
+  |7:
+  |.else
+  |  cmp RB, LJ_TISNUM;  ja >8
+  |.endif
+  |  mov RB, LJ_TNUMX
+  |8:
+  |  not RB				// ~tag is the index relative to gcroot[GCROOT_BASEMT].
+  |  mov TAB:RB, [DISPATCH+RB*4+DISPATCH_GL(gcroot[GCROOT_BASEMT])]
+  |  jmp <2
+  |
+  |.ffunc_2 setmetatable		// Inline case: table without metatable gets a table as metatable.
+  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
+  |  // Fast path: no mt for table yet and not clearing the mt.
+  |  mov TAB:RB, [BASE]
+  |  cmp dword TAB:RB->metatable, 0;  jne ->fff_fallback
+  |  cmp dword [BASE+12], LJ_TTAB;  jne ->fff_fallback
+  |  mov TAB:RC, [BASE+8]
+  |  mov TAB:RB->metatable, TAB:RC
+  |  mov PC, [BASE-4]
+  |  mov dword [BASE-4], LJ_TTAB		// Return original table.
+  |  mov [BASE-8], TAB:RB
+  |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+  |  jz >1
+  |  // Possible write barrier. Table is black, but skip iswhite(mt) check.
+  |  barrierback TAB:RB, RC
+  |1:
+  |  jmp ->fff_res1
+  |
+  |.ffunc_2 rawget			// Returns a copy of the slot found by lj_tab_get(L, t, key).
+  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
+  |.if X64WIN
+  |  mov RB, BASE			// Save BASE.
+  |  lea CARG3d, [BASE+8]
+  |  mov CARG2d, [BASE]			// Caveat: CARG2d == BASE.
+  |  mov CARG1d, SAVE_L
+  |.elif X64
+  |  mov RB, BASE			// Save BASE.
+  |  mov CARG2d, [BASE]
+  |  lea CARG3d, [BASE+8]		// Caveat: CARG3d == BASE.
+  |  mov CARG1d, SAVE_L
+  |.else
+  |  mov TAB:RD, [BASE]
+  |  mov L:RB, SAVE_L
+  |  mov ARG2, TAB:RD
+  |  mov ARG1, L:RB
+  |  mov RB, BASE			// Save BASE.
+  |  add BASE, 8				// BASE is used as scratch for the key pointer.
+  |  mov ARG3, BASE
+  |.endif
+  |  call extern lj_tab_get	// (lua_State *L, GCtab *t, cTValue *key)
+  |  // cTValue * returned in eax (RD).
+  |  mov BASE, RB			// Restore BASE.
+  |  // Copy table slot.
+  |.if X64
+  |  mov RBa, [RD]
+  |  mov PC, [BASE-4]			// PC must be fetched before the result overwrites [BASE-4].
+  |  mov [BASE-8], RBa
+  |.else
+  |  mov RB, [RD]
+  |  mov RD, [RD+4]
+  |  mov PC, [BASE-4]			// PC must be fetched before the result overwrites [BASE-4].
+  |  mov [BASE-8], RB
+  |  mov [BASE-4], RD
+  |.endif
+  |  jmp ->fff_res1
+  |
+  |//-- Base library: conversions ------------------------------------------
+  |
+  |.ffunc tonumber			// Returns a number argument unchanged; anything else falls back.
+  |  // Only handles the number case inline (without a base argument).
+  |  cmp NARGS:RD, 1+1;  jne ->fff_fallback	// Exactly one argument.
+  |  cmp dword [BASE+4], LJ_TISNUM
+  |.if DUALNUM
+  |  jne >1
+  |  mov RB, dword [BASE]; jmp ->fff_resi	// Integer: return it via fff_resi.
+  |1:
+  |  ja ->fff_fallback			// Still uses the flags of the cmp above.
+  |.else
+  |  jae ->fff_fallback
+  |.endif
+  |  movsd xmm0, qword [BASE]; jmp ->fff_resxmm0
+  |
+  |.ffunc_1 tostring			// Strings are returned as-is, numbers are formatted by a C helper.
+  |  // Only handles the string or number case inline.
+  |  mov PC, [BASE-4]
+  |  cmp dword [BASE+4], LJ_TSTR;  jne >3
+  |  // A __tostring method in the string base metatable is ignored.
+  |  mov STR:RD, [BASE]
+  |2:					// Return the string in STR:RD.
+  |  mov dword [BASE-4], LJ_TSTR
+  |  mov [BASE-8], STR:RD
+  |  jmp ->fff_res1
+  |3:  // Handle numbers inline, unless a number base metatable is present.
+  |  cmp dword [BASE+4], LJ_TISNUM;  ja ->fff_fallback
+  |  cmp dword [DISPATCH+DISPATCH_GL(gcroot[GCROOT_BASEMT_NUM])], 0
+  |  jne ->fff_fallback
+  |  ffgccheck				// Caveat: uses label 1.
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE		// Add frame since C call can throw.
+  |  mov SAVE_PC, PC			// Redundant (but a defined value).
+  |.if X64 and not X64WIN
+  |  mov FCARG2, BASE			// Otherwise: FCARG2 == BASE
+  |.endif
+  |  mov L:FCARG1, L:RB
+  |.if DUALNUM
+  |  call extern lj_strfmt_number@8	// (lua_State *L, cTValue *o)
+  |.else
+  |  call extern lj_strfmt_num@8	// (lua_State *L, lua_Number *np)
+  |.endif
+  |  // GCstr returned in eax (RD).
+  |  mov BASE, L:RB->base
+  |  jmp <2
+  |
+  |//-- Base library: iterators -------------------------------------------
+  |
+  |.ffunc_1 next				// Table traversal step via lj_tab_next; result is written at BASE-8.
+  |  je >2				// Missing 2nd arg?
+  |1:
+  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
+  |  mov PC, [BASE-4]
+  |  mov RB, BASE			// Save BASE.
+  |.if X64WIN
+  |  mov CARG1d, [BASE]
+  |  lea CARG3d, [BASE-8]
+  |  lea CARG2d, [BASE+8]		// Caveat: CARG2d == BASE.
+  |.elif X64
+  |  mov CARG1d, [BASE]
+  |  lea CARG2d, [BASE+8]
+  |  lea CARG3d, [BASE-8]		// Caveat: CARG3d == BASE.
+  |.else
+  |  mov TAB:RD, [BASE]
+  |  mov ARG1, TAB:RD
+  |  add BASE, 8				// BASE is used as scratch: key pointer = base+8.
+  |  mov ARG2, BASE
+  |  sub BASE, 8+8			// Output pointer = base-8.
+  |  mov ARG3, BASE
+  |.endif
+  |  call extern lj_tab_next		// (GCtab *t, cTValue *key, TValue *o)
+  |  // 1=found, 0=end, -1=error returned in eax (RD).
+  |  mov BASE, RB			// Restore BASE.
+  |  test RD, RD;  jg ->fff_res2	// Found key/value.
+  |  js ->fff_fallback_2		// Invalid key.
+  |  // End of traversal: return nil.
+  |  mov dword [BASE-4], LJ_TNIL
+  |  jmp ->fff_res1
+  |2:  // Set missing 2nd arg to nil.
+  |  mov dword [BASE+12], LJ_TNIL
+  |  jmp <1
+  |
+  |.ffunc_1 pairs			// Returns upvalue[0] of this function, the table and nil (3 results).
+  |  mov TAB:RB, [BASE]
+  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
+#if LJ_52
+  |  cmp dword TAB:RB->metatable, 0; jne ->fff_fallback
+#endif
+  |  mov CFUNC:RB, [BASE-8]
+  |  mov CFUNC:RD, CFUNC:RB->upvalue[0]
+  |  mov PC, [BASE-4]
+  |  mov dword [BASE-4], LJ_TFUNC
+  |  mov [BASE-8], CFUNC:RD
+  |  mov dword [BASE+12], LJ_TNIL		// 3rd result is nil; the table stays in place at BASE.
+  |  mov RD, 1+3
+  |  jmp ->fff_res
+  |
+  |.ffunc_2 ipairs_aux			// Iterator step: returns (i+1, t[i+1]) or no results if the slot is nil.
+  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
+  |  cmp dword [BASE+12], LJ_TISNUM
+  |.if DUALNUM
+  |  jne ->fff_fallback
+  |.else
+  |  jae ->fff_fallback
+  |.endif
+  |  mov PC, [BASE-4]
+  |.if DUALNUM
+  |  mov RD, dword [BASE+8]
+  |  add RD, 1
+  |  mov dword [BASE-4], LJ_TISNUM
+  |  mov dword [BASE-8], RD
+  |.else
+  |  movsd xmm0, qword [BASE+8]
+  |  sseconst_1 xmm1, RBa
+  |  addsd xmm0, xmm1
+  |  cvttsd2si RD, xmm0			// RD = integer index, xmm0 = same index as a number.
+  |  movsd qword [BASE-8], xmm0
+  |.endif
+  |  mov TAB:RB, [BASE]
+  |  cmp RD, TAB:RB->asize;  jae >2	// Not in array part?
+  |  shl RD, 3				// RD = address of array slot (8 bytes per slot).
+  |  add RD, TAB:RB->array
+  |1:
+  |  cmp dword [RD+4], LJ_TNIL;  je ->fff_res0
+  |  // Copy array slot.
+  |.if X64
+  |  mov RBa, [RD]
+  |  mov [BASE], RBa
+  |.else
+  |  mov RB, [RD]
+  |  mov RD, [RD+4]
+  |  mov [BASE], RB
+  |  mov [BASE+4], RD
+  |.endif
+  |->fff_res2:				// Return 2 results.
+  |  mov RD, 1+2
+  |  jmp ->fff_res
+  |2:  // Check for empty hash part first. Otherwise call C function.
+  |  cmp dword TAB:RB->hmask, 0; je ->fff_res0
+  |  mov FCARG1, TAB:RB
+  |  mov RB, BASE			// Save BASE.
+  |  mov FCARG2, RD			// Caveat: FCARG2 == BASE
+  |  call extern lj_tab_getinth@8	// (GCtab *t, int32_t key)
+  |  // cTValue * or NULL returned in eax (RD).
+  |  mov BASE, RB			// Restore BASE.
+  |  test RD, RD
+  |  jnz <1
+  |->fff_res0:				// Return no results.
+  |  mov RD, 1+0
+  |  jmp ->fff_res
+  |
+  |.ffunc_1 ipairs			// Returns upvalue[0] of this function, the table and 0 (3 results).
+  |  mov TAB:RB, [BASE]
+  |  cmp dword [BASE+4], LJ_TTAB;  jne ->fff_fallback
+#if LJ_52
+  |  cmp dword TAB:RB->metatable, 0; jne ->fff_fallback
+#endif
+  |  mov CFUNC:RB, [BASE-8]
+  |  mov CFUNC:RD, CFUNC:RB->upvalue[0]
+  |  mov PC, [BASE-4]
+  |  mov dword [BASE-4], LJ_TFUNC
+  |  mov [BASE-8], CFUNC:RD
+  |.if DUALNUM
+  |  mov dword [BASE+12], LJ_TISNUM	// 3rd result: integer 0.
+  |  mov dword [BASE+8], 0
+  |.else
+  |  xorps xmm0, xmm0			// 3rd result: number 0.
+  |  movsd qword [BASE+8], xmm0
+  |.endif
+  |  mov RD, 1+3
+  |  jmp ->fff_res
+  |
+  |//-- Base library: catch errors ----------------------------------------
+  |
+  |.ffunc_1 pcall			// Calls the 1st argument with a FRAME_PCALL frame.
+  |  lea RA, [BASE+8]			// New base is one slot above BASE.
+  |  sub NARGS:RD, 1
+  |  mov PC, 8+FRAME_PCALL		// Frame delta 8 + frame type.
+  |1:					// Shared tail, also used by xpcall below.
+  |  movzx RB, byte [DISPATCH+DISPATCH_GL(hookmask)]
+  |  shr RB, HOOK_ACTIVE_SHIFT
+  |  and RB, 1
+  |  add PC, RB				// Remember active hook before pcall.
+  |  jmp ->vm_call_dispatch
+  |
+  |.ffunc_2 xpcall			// Like pcall, but the 2nd argument must be a function.
+  |  cmp dword [BASE+12], LJ_TFUNC;  jne ->fff_fallback
+  |  mov RB, [BASE+4]			// Swap function and traceback.
+  |  mov [BASE+12], RB
+  |  mov dword [BASE+4], LJ_TFUNC
+  |  mov LFUNC:RB, [BASE]
+  |  mov PC, [BASE+8]
+  |  mov [BASE+8], LFUNC:RB
+  |  mov [BASE], PC
+  |  lea RA, [BASE+16]			// New base is two slots above BASE.
+  |  sub NARGS:RD, 2
+  |  mov PC, 16+FRAME_PCALL		// Frame delta 16 + frame type.
+  |  jmp <1
+  |
+  |//-- Coroutine library --------------------------------------------------
+  |
+  |.macro coroutine_resume_wrap, resume	// resume = 1: coroutine.resume, 0: coroutine.wrap helper.
+  |.if resume
+  |.ffunc_1 coroutine_resume
+  |  mov L:RB, [BASE]			// Coroutine is the 1st argument.
+  |.else
+  |.ffunc coroutine_wrap_aux
+  |  mov CFUNC:RB, [BASE-8]
+  |  mov L:RB, CFUNC:RB->upvalue[0].gcr	// Coroutine is upvalue[0] of this function.
+  |.endif
+  |  mov PC, [BASE-4]
+  |  mov SAVE_PC, PC
+  |.if X64
+  |  mov TMP1, L:RB			// Keep the coroutine for after the call.
+  |.else
+  |  mov ARG1, L:RB			// Keep the coroutine; doubles as 1st call argument.
+  |.endif
+  |.if resume
+  |  cmp dword [BASE+4], LJ_TTHREAD;  jne ->fff_fallback
+  |.endif
+  |  cmp aword L:RB->cframe, 0; jne ->fff_fallback
+  |  cmp byte L:RB->status, LUA_YIELD;  ja ->fff_fallback
+  |  mov RA, L:RB->top			// mov keeps the flags of the status compare.
+  |  je >1				// Status != LUA_YIELD (i.e. 0)?
+  |  cmp RA, L:RB->base			// Check for presence of initial func.
+  |  je ->fff_fallback
+  |1:
+  |.if resume
+  |  lea PC, [RA+NARGS:RD*8-16]		// Check stack space (-1-thread).
+  |.else
+  |  lea PC, [RA+NARGS:RD*8-8]		// Check stack space (-1).
+  |.endif
+  |  cmp PC, L:RB->maxstack; ja ->fff_fallback
+  |  mov L:RB->top, PC			// New top of the coroutine stack.
+  |
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |.if resume
+  |  add BASE, 8			// Keep resumed thread in stack for GC.
+  |.endif
+  |  mov L:RB->top, BASE
+  |.if resume
+  |  lea RB, [BASE+NARGS:RD*8-24]	// RB = end of source for stack move.
+  |.else
+  |  lea RB, [BASE+NARGS:RD*8-16]	// RB = end of source for stack move.
+  |.endif
+  |  sub RBa, PCa			// Relative to PC.
+  |
+  |  cmp PC, RA
+  |  je >3				// Nothing to move.
+  |2:  // Move args to coroutine.
+  |.if X64
+  |  mov RCa, [PC+RB]
+  |  mov [PC-8], RCa
+  |.else
+  |  mov RC, [PC+RB+4]
+  |  mov [PC-4], RC
+  |  mov RC, [PC+RB]
+  |  mov [PC-8], RC
+  |.endif
+  |  sub PC, 8				// Copy runs from the top down to RA.
+  |  cmp PC, RA
+  |  jne <2
+  |3:
+  |.if X64
+  |  mov CARG2d, RA
+  |  mov CARG1d, TMP1
+  |.else
+  |  mov ARG2, RA
+  |  xor RA, RA
+  |  mov ARG4, RA
+  |  mov ARG3, RA
+  |.endif
+  |  call ->vm_resume			// (lua_State *L, TValue *base, 0, 0)
+  |
+  |  mov L:RB, SAVE_L
+  |.if X64
+  |  mov L:PC, TMP1
+  |.else
+  |  mov L:PC, ARG1			// The callee doesn't modify SAVE_L.
+  |.endif
+  |  mov BASE, L:RB->base
+  |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+  |  set_vmstate INTERP
+  |
+  |  cmp eax, LUA_YIELD
+  |  ja >8				// Status above LUA_YIELD: error.
+  |4:
+  |  mov RA, L:PC->base
+  |  mov KBASE, L:PC->top
+  |  mov L:PC->top, RA			// Clear coroutine stack.
+  |  mov PC, KBASE
+  |  sub PC, RA				// PC = size of the results in bytes.
+  |  je >6				// No results?
+  |  lea RD, [BASE+PC]
+  |  shr PC, 3				// PC = number of results.
+  |  cmp RD, L:RB->maxstack
+  |  ja >9				// Need to grow stack?
+  |
+  |  mov RB, BASE
+  |  sub RBa, RAa			// RB = destination offset relative to RA.
+  |5:  // Move results from coroutine.
+  |.if X64
+  |  mov RDa, [RA]
+  |  mov [RA+RB], RDa
+  |.else
+  |  mov RD, [RA]
+  |  mov [RA+RB], RD
+  |  mov RD, [RA+4]
+  |  mov [RA+RB+4], RD
+  |.endif
+  |  add RA, 8
+  |  cmp RA, KBASE
+  |  jne <5
+  |6:
+  |.if resume
+  |  lea RD, [PC+2]			// nresults+1 = 1 + true + results.
+  |  mov dword [BASE-4], LJ_TTRUE	// Prepend true to results.
+  |.else
+  |  lea RD, [PC+1]			// nresults+1 = 1 + results.
+  |.endif
+  |7:
+  |  mov PC, SAVE_PC
+  |  mov MULTRES, RD
+  |.if resume
+  |  mov RAa, -8			// Results start at BASE-8.
+  |.else
+  |  xor RA, RA				// Results start at BASE.
+  |.endif
+  |  test PC, FRAME_TYPE
+  |  jz ->BC_RET_Z
+  |  jmp ->vm_return
+  |
+  |8:  // Coroutine returned with error (at co->top-1).
+  |.if resume
+  |  mov dword [BASE-4], LJ_TFALSE	// Prepend false to results.
+  |  mov RA, L:PC->top
+  |  sub RA, 8
+  |  mov L:PC->top, RA			// Clear error from coroutine stack.
+  |  // Copy error message.
+  |.if X64
+  |  mov RDa, [RA]
+  |  mov [BASE], RDa
+  |.else
+  |  mov RD, [RA]
+  |  mov [BASE], RD
+  |  mov RD, [RA+4]
+  |  mov [BASE+4], RD
+  |.endif
+  |  mov RD, 1+2			// nresults+1 = 1 + false + error.
+  |  jmp <7
+  |.else
+  |  mov FCARG2, L:PC
+  |  mov FCARG1, L:RB
+  |  call extern lj_ffh_coroutine_wrap_err@8  // (lua_State *L, lua_State *co)
+  |  // Error function does not return.
+  |.endif
+  |
+  |9:  // Handle stack expansion on return from yield.
+  |.if X64
+  |  mov L:RA, TMP1
+  |.else
+  |  mov L:RA, ARG1			// The callee doesn't modify SAVE_L.
+  |.endif
+  |  mov L:RA->top, KBASE		// Undo coroutine stack clearing.
+  |  mov FCARG2, PC			// n = number of results.
+  |  mov FCARG1, L:RB
+  |  call extern lj_state_growstack@8	// (lua_State *L, int n)
+  |.if X64
+  |  mov L:PC, TMP1
+  |.else
+  |  mov L:PC, ARG1
+  |.endif
+  |  mov BASE, L:RB->base
+  |  jmp <4				// Retry the stack move.
+  |.endmacro
+  |
+  |  coroutine_resume_wrap 1		// coroutine.resume
+  |  coroutine_resume_wrap 0		// coroutine.wrap
+  |
+  |.ffunc coroutine_yield		// Marks L as yielded and leaves the VM via vm_leave_unw.
+  |  mov L:RB, SAVE_L
+  |  test aword L:RB->cframe, CFRAME_RESUME
+  |  jz ->fff_fallback			// CFRAME_RESUME bit not set: fall back.
+  |  mov L:RB->base, BASE
+  |  lea RD, [BASE+NARGS:RD*8-8]		// RD = end of the arguments.
+  |  mov L:RB->top, RD
+  |  xor RD, RD
+  |  mov aword L:RB->cframe, RDa		// Clear L->cframe.
+  |  mov al, LUA_YIELD
+  |  mov byte L:RB->status, al
+  |  jmp ->vm_leave_unw
+  |
+  |//-- Math library -------------------------------------------------------
+  |
+  |.if not DUALNUM			// With DUALNUM the real fff_resi is defined inside math_abs.
+  |->fff_resi:  // Dummy.
+  |.endif
+  |
+  |->fff_resn:				// Return the x87 top-of-stack value as the single result.
+  |  mov PC, [BASE-4]
+  |  fstp qword [BASE-8]			// Store and pop st0.
+  |  jmp ->fff_res1
+  |
+  |  .ffunc_1 math_abs			// Absolute value; falls through into the shared result return code.
+  |.if DUALNUM
+  |  cmp dword [BASE+4], LJ_TISNUM; jne >2
+  |  mov RB, dword [BASE]
+  |  cmp RB, 0; jns ->fff_resi		// Non-negative integer: return as is.
+  |  neg RB; js >1			// Still negative after neg: only for 0x80000000.
+  |->fff_resbit:
+  |->fff_resi:				// Return the integer in RB.
+  |  mov PC, [BASE-4]
+  |  mov dword [BASE-4], LJ_TISNUM
+  |  mov dword [BASE-8], RB
+  |  jmp ->fff_res1
+  |1:					// Return 2^31 as a number.
+  |  mov PC, [BASE-4]
+  |  mov dword [BASE-4], 0x41e00000  // 2^31.
+  |  mov dword [BASE-8], 0
+  |  jmp ->fff_res1
+  |2:
+  |  ja ->fff_fallback
+  |.else
+  |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
+  |.endif
+  |  movsd xmm0, qword [BASE]
+  |  sseconst_abs xmm1, RDa
+  |  andps xmm0, xmm1
+  |->fff_resxmm0:			// Return the number in xmm0.
+  |  mov PC, [BASE-4]
+  |  movsd qword [BASE-8], xmm0
+  |  // fallthrough
+  |
+  |->fff_res1:				// Return 1 result (already stored at BASE-8).
+  |  mov RD, 1+1
+  |->fff_res:				// Return RD-1 results, PC = saved PC of the frame.
+  |  mov MULTRES, RD
+  |->fff_res_:				// Same, but MULTRES is already set.
+  |  test PC, FRAME_TYPE
+  |  jnz >7
+  |5:
+  |  cmp PC_RB, RDL			// More results expected?
+  |  ja >6
+  |  // Adjust BASE. KBASE is assumed to be set for the calling frame.
+  |  movzx RA, PC_RA
+  |  not RAa				// Note: ~RA = -(RA+1)
+  |  lea BASE, [BASE+RA*8]		// base = base - (RA+1)*8
+  |  ins_next
+  |
+  |6:  // Fill up results with nil.
+  |  mov dword [BASE+RD*8-12], LJ_TNIL
+  |  add RD, 1
+  |  jmp <5
+  |
+  |7:  // Non-standard return case.
+  |  mov RAa, -8			// Results start at BASE+RA = BASE-8.
+  |  jmp ->vm_return
+  |
+  |.if X64				// fff_resfp: result return used after the extern C math calls below.
+  |.define fff_resfp, fff_resxmm0	// Result is taken from xmm0.
+  |.else
+  |.define fff_resfp, fff_resn		// Result is popped from the x87 stack.
+  |.endif
+  |
+  |.macro math_round, func		// Defines ff_math_<func>, rounding via the vm_<func>_sse helper.
+  |  .ffunc math_ .. func
+  |.if DUALNUM
+  |  cmp dword [BASE+4], LJ_TISNUM; jne >1
+  |  mov RB, dword [BASE]; jmp ->fff_resi	// Integers are returned unchanged.
+  |1:
+  |  ja ->fff_fallback
+  |.else
+  |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
+  |.endif
+  |  movsd xmm0, qword [BASE]
+  |  call ->vm_ .. func .. _sse
+  |.if DUALNUM
+  |  cvttsd2si RB, xmm0
+  |  cmp RB, 0x80000000			// cvttsd2si yields 0x80000000 if the value does not fit.
+  |  jne ->fff_resi
+  |  cvtsi2sd xmm1, RB			// Check whether the result really is -2^31.
+  |  ucomisd xmm0, xmm1
+  |  jp ->fff_resxmm0			// Unordered (NaN): return as number.
+  |  je ->fff_resi
+  |.endif
+  |  jmp ->fff_resxmm0
+  |.endmacro
+  |
+  |  math_round floor
+  |  math_round ceil
+  |
+  |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0	// sqrtsd directly on the 1st argument.
+  |
+  |.ffunc math_log			// Only the 1-argument form is handled here, via extern log.
+  |  cmp NARGS:RD, 1+1; jne ->fff_fallback	// Exactly one argument.
+  |  cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
+  |  movsd xmm0, qword [BASE]
+  |.if not X64
+  |  movsd FPARG1, xmm0
+  |.endif
+  |  mov RB, BASE			// Save BASE across the C call.
+  |  call extern log
+  |  mov BASE, RB			// Restore BASE.
+  |  jmp ->fff_resfp
+  |
+  |.macro math_extern, func		// Defines ff_math_<func> as a call to extern <func> with 1 number.
+  |  .ffunc_nsse math_ .. func
+  |.if not X64
+  |  movsd FPARG1, xmm0
+  |.endif
+  |  mov RB, BASE			// Save BASE across the C call.
+  |  call extern func
+  |  mov BASE, RB			// Restore BASE.
+  |  jmp ->fff_resfp
+  |.endmacro
+  |
+  |.macro math_extern2, func		// Same as math_extern, but with 2 number arguments.
+  |  .ffunc_nnsse math_ .. func
+  |.if not X64
+  |  movsd FPARG1, xmm0
+  |  movsd FPARG3, xmm1
+  |.endif
+  |  mov RB, BASE			// Save BASE across the C call.
+  |  call extern func
+  |  mov BASE, RB			// Restore BASE.
+  |  jmp ->fff_resfp
+  |.endmacro
+  |
+  |  math_extern log10
+  |  math_extern exp
+  |  math_extern sin
+  |  math_extern cos
+  |  math_extern tan
+  |  math_extern asin
+  |  math_extern acos
+  |  math_extern atan
+  |  math_extern sinh
+  |  math_extern cosh
+  |  math_extern tanh
+  |  math_extern2 pow
+  |  math_extern2 atan2
+  |  math_extern2 fmod
+  |
+  |.ffunc_nnr math_ldexp;	fscale; fpop1;	jmp ->fff_resn	// st0 = arg1, st1 = arg2 on entry to fscale.
+  |
+  |.ffunc_1 math_frexp			// Returns mantissa (at BASE-8) and exponent (at BASE) of a number.
+  |  mov RB, [BASE+4]
+  |  cmp RB, LJ_TISNUM;  jae ->fff_fallback
+  |  mov PC, [BASE-4]
+  |  mov RC, [BASE]
+  |  mov [BASE-4], RB; mov [BASE-8], RC	// Copy the argument to the 1st result slot.
+  |  shl RB, 1; cmp RB, 0xffe00000; jae >3	// Drop the sign bit; all exponent bits set?
+  |  or RC, RB; jz >3			// All remaining bits zero?
+  |  mov RC, 1022
+  |  cmp RB, 0x00200000; jb >4		// Exponent field is zero?
+  |1:
+  |  shr RB, 21; sub RB, RC		// Extract and unbias exponent.
+  |  cvtsi2sd xmm0, RB
+  |  mov RB, [BASE-4]
+  |  and RB, 0x800fffff			// Mask off exponent.
+  |  or RB, 0x3fe00000			// Put mantissa in range [0.5,1) or 0.
+  |  mov [BASE-4], RB
+  |2:
+  |  movsd qword [BASE], xmm0		// 2nd result: the exponent as a number.
+  |  mov RD, 1+2
+  |  jmp ->fff_res
+  |3:  // Return +-0, +-Inf, NaN unmodified and an exponent of 0.
+  |  xorps xmm0, xmm0; jmp <2
+  |4:  // Handle denormals by multiplying with 2^54 and adjusting the bias.
+  |  movsd xmm0, qword [BASE]
+  |  sseconst_hi xmm1, RBa, 43500000  // 2^54.
+  |  mulsd xmm0, xmm1
+  |  movsd qword [BASE-8], xmm0
+  |  mov RB, [BASE-4]; mov RC, 1076; shl RB, 1; jmp <1	// 1076 = 1022 + 54.
+  |
+  |.ffunc_nsse math_modf		// Returns the truncated value (at BASE-8) and the remainder (at BASE).
+  |  mov RB, [BASE+4]
+  |  mov PC, [BASE-4]
+  |  shl RB, 1; cmp RB, 0xffe00000; je >4	// +-Inf?
+  |  movaps xmm4, xmm0
+  |  call ->vm_trunc_sse
+  |  subsd xmm4, xmm0			// xmm4 = x - trunc(x).
+  |1:
+  |  movsd qword [BASE-8], xmm0
+  |  movsd qword [BASE], xmm4
+  |  mov RC, [BASE-4]; mov RB, [BASE+4]
+  |  xor RC, RB; js >3				// Need to adjust sign?
+  |2:
+  |  mov RD, 1+2
+  |  jmp ->fff_res
+  |3:
+  |  xor RB, 0x80000000; mov [BASE+4], RB	// Flip sign of fraction.
+  |  jmp <2
+  |4:
+  |  xorps xmm4, xmm4; jmp <1			// Return +-Inf and +-0.
+  |
+  |.macro math_minmax, name, cmovop, sseop	// Defines ff_<name>: folds all arguments with cmovop/sseop.
+  |  .ffunc_1 name
+  |  mov RA, 2				// RA = 1-based index of the next argument + 1 (compared to RD).
+  |  cmp dword [BASE+4], LJ_TISNUM
+  |.if DUALNUM
+  |  jne >4
+  |  mov RB, dword [BASE]
+  |1:  // Handle integers.
+  |  cmp RA, RD; jae ->fff_resi		// All arguments consumed: return the integer in RB.
+  |  cmp dword [BASE+RA*8-4], LJ_TISNUM; jne >3
+  |  cmp RB, dword [BASE+RA*8-8]
+  |  cmovop RB, dword [BASE+RA*8-8]
+  |  add RA, 1
+  |  jmp <1
+  |3:
+  |  ja ->fff_fallback
+  |  // Convert intermediate result to number and continue below.
+  |  cvtsi2sd xmm0, RB
+  |  jmp >6
+  |4:
+  |  ja ->fff_fallback
+  |.else
+  |  jae ->fff_fallback
+  |.endif
+  |
+  |  movsd xmm0, qword [BASE]
+  |5:  // Handle numbers or integers.
+  |  cmp RA, RD; jae ->fff_resxmm0	// All arguments consumed: return the number in xmm0.
+  |  cmp dword [BASE+RA*8-4], LJ_TISNUM
+  |.if DUALNUM
+  |  jb >6
+  |  ja ->fff_fallback
+  |  cvtsi2sd xmm1, dword [BASE+RA*8-8]	// Integer argument: convert to a number.
+  |  jmp >7
+  |.else
+  |  jae ->fff_fallback
+  |.endif
+  |6:
+  |  movsd xmm1, qword [BASE+RA*8-8]
+  |7:
+  |  sseop xmm0, xmm1
+  |  add RA, 1
+  |  jmp <5
+  |.endmacro
+  |
+  |  math_minmax math_min, cmovg, minsd
+  |  math_minmax math_max, cmovl, maxsd
+  |
+  |//-- String library -----------------------------------------------------
+  |
+  |.ffunc string_byte			// Only handle the 1-arg case here.
+  |  cmp NARGS:RD, 1+1;  jne ->fff_fallback
+  |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback
+  |  mov STR:RB, [BASE]
+  |  mov PC, [BASE-4]
+  |  cmp dword STR:RB->len, 1
+  |  jb ->fff_res0			// Return no results for empty string.
+  |  movzx RB, byte STR:RB[1]		// Load the byte located right after the GCstr struct.
+  |.if DUALNUM
+  |  jmp ->fff_resi
+  |.else
+  |  cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
+  |.endif
+  |
+  |.ffunc string_char			// Only handle the 1-arg case here.
+  |  ffgccheck				// Presumably runs a GC step if needed before allocating.
+  |  cmp NARGS:RD, 1+1;  jne ->fff_fallback	// *Exactly* 1 arg.
+  |  cmp dword [BASE+4], LJ_TISNUM
+  |.if DUALNUM
+  |  jne ->fff_fallback
+  |  mov RB, dword [BASE]
+  |  cmp RB, 255;  ja ->fff_fallback	// Unsigned compare also rejects negative values.
+  |  mov TMP2, RB			// Character code goes to a stack temp.
+  |.else
+  |  jae ->fff_fallback
+  |  cvttsd2si RB, qword [BASE]		// Truncating double -> int conversion.
+  |  cmp RB, 255;  ja ->fff_fallback	// Unsigned compare also rejects negative values.
+  |  mov TMP2, RB			// Character code goes to a stack temp.
+  |.endif
+  |.if X64
+  |  mov TMP3, 1			// Length 1 (read back at fff_newstr).
+  |.else
+  |  mov ARG3, 1			// Length 1, already in the 3rd argument slot.
+  |.endif
+  |  lea RDa, TMP2			// Points to stack. Little-endian.
+  |->fff_newstr:			// Shared tail: RD = data ptr, length in TMP3 (x64) or ARG3 (x86).
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |.if X64
+  |  mov CARG3d, TMP3			// Zero-extended to size_t.
+  |  mov CARG2, RDa			// May be 64 bit ptr to stack.
+  |  mov CARG1d, L:RB
+  |.else
+  |  mov ARG2, RD
+  |  mov ARG1, L:RB
+  |.endif
+  |  mov SAVE_PC, PC
+  |  call extern lj_str_new		// (lua_State *L, char *str, size_t l)
+  |->fff_resstr:			// Shared tail: return the string in RD as the single result.
+  |  // GCstr * returned in eax (RD).
+  |  mov BASE, L:RB->base		// Reload BASE from L after the C call.
+  |  mov PC, [BASE-4]			// Read PC before the slot is overwritten with the tag.
+  |  mov dword [BASE-4], LJ_TSTR
+  |  mov [BASE-8], STR:RD
+  |  jmp ->fff_res1
+  |
+  |.ffunc string_sub			// string.sub(s, start [, end]) with range clamping.
+  |  ffgccheck
+  |  mov TMP2, -1			// Default end = -1.
+  |  cmp NARGS:RD, 1+2;  jb ->fff_fallback	// Need at least 2 args.
+  |  jna >1				// Exactly 2 args: keep the default end.
+  |  cmp dword [BASE+20], LJ_TISNUM	// Type tag of the 3rd arg (end).
+  |.if DUALNUM
+  |  jne ->fff_fallback
+  |  mov RB, dword [BASE+16]
+  |  mov TMP2, RB
+  |.else
+  |  jae ->fff_fallback
+  |  cvttsd2si RB, qword [BASE+16]
+  |  mov TMP2, RB
+  |.endif
+  |1:
+  |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback	// 1st arg must be a string.
+  |  cmp dword [BASE+12], LJ_TISNUM	// Type tag of the 2nd arg (start).
+  |.if DUALNUM
+  |  jne ->fff_fallback
+  |.else
+  |  jae ->fff_fallback
+  |.endif
+  |  mov STR:RB, [BASE]
+  |  mov TMP3, STR:RB			// Save the string pointer; RB is reused for len.
+  |  mov RB, STR:RB->len
+  |.if DUALNUM
+  |  mov RA, dword [BASE+8]		// RA = start.
+  |.else
+  |  cvttsd2si RA, qword [BASE+8]	// RA = start.
+  |.endif
+  |  mov RC, TMP2			// RC = end.
+  |  cmp RB, RC				// len < end? (unsigned compare)
+  |  jb >5
+  |2:
+  |  test RA, RA			// start <= 0?
+  |  jle >7
+  |3:
+  |  mov STR:RB, TMP3			// Restore the string pointer.
+  |  sub RC, RA				// start > end?
+  |  jl ->fff_emptystr
+  |  lea RB, [STR:RB+RA+#STR-1]		// RB = address of character start (1-based) in the data.
+  |  add RC, 1				// Length = end - start + 1.
+  |4:
+  |.if X64
+  |  mov TMP3, RC			// Length where fff_newstr expects it.
+  |.else
+  |  mov ARG3, RC			// Length where fff_newstr expects it.
+  |.endif
+  |  mov RD, RB
+  |  jmp ->fff_newstr
+  |
+  |5:  // Negative end or overflow.
+  |  jl >6				// Flags still from cmp RB, RC: signed len < end.
+  |  lea RC, [RC+RB+1]			// end = end+(len+1)
+  |  jmp <2
+  |6:  // Overflow.
+  |  mov RC, RB				// end = len
+  |  jmp <2
+  |
+  |7:  // Negative start or underflow.
+  |  je >8				// Flags still from test RA, RA: start == 0.
+  |  add RA, RB				// start = start+(len+1)
+  |  add RA, 1
+  |  jg <3				// start > 0?
+  |8:  // Underflow.
+  |  mov RA, 1				// start = 1
+  |  jmp <3
+  |
+  |->fff_emptystr:  // Range underflow.
+  |  xor RC, RC				// Zero length. Any ptr in RB is ok.
+  |  jmp <4
+  |
+  |.macro ffstring_op, name		// Emits string_<name>: calls lj_buf_putstr_<name>, then lj_buf_tostr.
+  |  .ffunc_1 string_ .. name
+  |  ffgccheck
+  |  cmp dword [BASE+4], LJ_TSTR;  jne ->fff_fallback	// Arg must be a string.
+  |  mov L:RB, SAVE_L
+  |   lea SBUF:FCARG1, [DISPATCH+DISPATCH_GL(tmpbuf)]	// FCARG1 = address of the tmpbuf field.
+  |  mov L:RB->base, BASE
+  |  mov STR:FCARG2, [BASE]		// Caveat: FCARG2 == BASE
+  |   mov RCa, SBUF:FCARG1->b
+  |   mov SBUF:FCARG1->L, L:RB
+  |   mov SBUF:FCARG1->w, RCa		// Set w = b (presumably resets the buffer).
+  |  mov SAVE_PC, PC
+  |  call extern lj_buf_putstr_ .. name .. @8
+  |  mov FCARG1, eax			// Feed the return value to lj_buf_tostr.
+  |  call extern lj_buf_tostr@4
+  |  jmp ->fff_resstr			// Result in eax; BASE is reloaded from L there.
+  |.endmacro
+  |
+  |ffstring_op reverse
+  |ffstring_op lower
+  |ffstring_op upper
+  |
+  |//-- Bit library --------------------------------------------------------
+  |
+  |.macro .ffunc_bit, name, kind, fdef	// Prologue for bit ops: 1st arg as 32 bit int in RB. fdef = entry macro.
+  |  fdef name
+  |.if kind == 2			// kind 2: load the constant into xmm1 up front.
+  |  sseconst_tobit xmm1, RBa
+  |.endif
+  |  cmp dword [BASE+4], LJ_TISNUM
+  |.if DUALNUM
+  |  jne >1
+  |  mov RB, dword [BASE]		// Integer arg: use as is.
+  |.if kind > 0
+  |  jmp >2				// Continue with the code following the macro.
+  |.else
+  |  jmp ->fff_resbit			// kind 0: integer is returned directly.
+  |.endif
+  |1:
+  |  ja ->fff_fallback			// Tag above LJ_TISNUM: not a number.
+  |.else
+  |  jae ->fff_fallback
+  |.endif
+  |  movsd xmm0, qword [BASE]
+  |.if kind < 2
+  |  sseconst_tobit xmm1, RBa
+  |.endif
+  |  addsd xmm0, xmm1			// Add the tobit constant (presumably a bias that leaves the int in the low bits).
+  |  movd RB, xmm0			// Low 32 bits of the sum.
+  |2:
+  |.endmacro
+  |
+  |.macro .ffunc_bit, name, kind	// 2-arg form: defaults the entry macro to .ffunc_1.
+  |  .ffunc_bit name, kind, .ffunc_1
+  |.endmacro
+  |
+  |.ffunc_bit bit_tobit, 0
+  |  jmp ->fff_resbit			// Return the converted value in RB.
+  |
+  |.macro .ffunc_bit_op, name, ins	// Variadic bit op: folds all args into RB with ins, last arg first.
+  |  .ffunc_bit name, 2
+  |  mov TMP2, NARGS:RD			// Save for fallback.
+  |  lea RD, [BASE+NARGS:RD*8-16]	// RD = address of the last argument slot.
+  |1:
+  |  cmp RD, BASE
+  |  jbe ->fff_resbit			// Back at the 1st arg (already in RB): done.
+  |  cmp dword [RD+4], LJ_TISNUM
+  |.if DUALNUM
+  |  jne >2
+  |  ins RB, dword [RD]			// Integer arg: apply directly from memory.
+  |  sub RD, 8
+  |  jmp <1
+  |2:
+  |  ja ->fff_fallback_bit_op
+  |.else
+  |  jae ->fff_fallback_bit_op
+  |.endif
+  |  movsd xmm0, qword [RD]
+  |  addsd xmm0, xmm1			// xmm1 still holds the constant loaded by .ffunc_bit (kind 2).
+  |  movd RA, xmm0
+  |  ins RB, RA
+  |  sub RD, 8
+  |  jmp <1
+  |.endmacro
+  |
+  |.ffunc_bit_op bit_band, and
+  |.ffunc_bit_op bit_bor, or
+  |.ffunc_bit_op bit_bxor, xor
+  |
+  |.ffunc_bit bit_bswap, 1
+  |  bswap RB				// Reverse the byte order of the 32 bit value.
+  |  jmp ->fff_resbit
+  |
+  |.ffunc_bit bit_bnot, 1
+  |  not RB				// Bitwise complement.
+  |.if DUALNUM
+  |  jmp ->fff_resbit
+  |.else
+  |->fff_resbit:			// Non-DUALNUM: defined here, bit_bnot falls through into it.
+  |  cvtsi2sd xmm0, RB			// Signed 32 bit int -> double.
+  |  jmp ->fff_resxmm0
+  |.endif
+  |
+  |->fff_fallback_bit_op:		// Fallback for .ffunc_bit_op, which reused RD as a pointer.
+  |  mov NARGS:RD, TMP2			// Restore for fallback
+  |  jmp ->fff_fallback
+  |
+  |.macro .ffunc_bit_sh, name, ins	// Shift/rotate: value in RB, count in RA (low byte cl is used).
+  |.if DUALNUM
+  |  .ffunc_bit name, 1, .ffunc_2
+  |  // Note: no inline conversion from number for 2nd argument!
+  |  cmp dword [BASE+12], LJ_TISNUM; jne ->fff_fallback
+  |  mov RA, dword [BASE+8]		// Shift count from the 2nd arg.
+  |.else
+  |  .ffunc_nnsse name
+  |  sseconst_tobit xmm2, RBa
+  |  addsd xmm0, xmm2			// Convert both args with the same constant.
+  |  addsd xmm1, xmm2
+  |  movd RB, xmm0			// Value.
+  |  movd RA, xmm1			// Shift count.
+  |.endif
+  |  ins RB, cl				// Assumes RA is ecx.
+  |  jmp ->fff_resbit
+  |.endmacro
+  |
+  |.ffunc_bit_sh bit_lshift, shl
+  |.ffunc_bit_sh bit_rshift, shr
+  |.ffunc_bit_sh bit_arshift, sar
+  |.ffunc_bit_sh bit_rol, rol
+  |.ffunc_bit_sh bit_ror, ror
+  |
+  |//-----------------------------------------------------------------------
+  |
+  |->fff_fallback_2:
+  |  mov NARGS:RD, 1+2			// Other args are ignored, anyway.
+  |  jmp ->fff_fallback
+  |->fff_fallback_1:
+  |  mov NARGS:RD, 1+1			// Other args are ignored, anyway.
+  |->fff_fallback:			// Call fast function fallback handler.
+  |  // BASE = new base, RD = nargs+1
+  |  mov L:RB, SAVE_L
+  |  mov PC, [BASE-4]			// Fallback may overwrite PC.
+  |  mov SAVE_PC, PC			// Redundant (but a defined value).
+  |  mov L:RB->base, BASE
+  |  lea RD, [BASE+NARGS:RD*8-8]	// RD = BASE + nargs*8: becomes L->top.
+  |  lea RA, [RD+8*LUA_MINSTACK]	// Ensure enough space for handler.
+  |  mov L:RB->top, RD
+  |  mov CFUNC:RD, [BASE-8]		// Function object from the frame slot below BASE.
+  |  cmp RA, L:RB->maxstack
+  |  ja >5				// Need to grow stack.
+  |.if X64
+  |  mov CARG1d, L:RB
+  |.else
+  |  mov ARG1, L:RB
+  |.endif
+  |  call aword CFUNC:RD->f		// (lua_State *L)
+  |  mov BASE, L:RB->base
+  |  // Either throws an error, or recovers and returns -1, 0 or nresults+1.
+  |  test RD, RD;  jg ->fff_res		// Returned nresults+1?
+  |1:
+  |  mov RA, L:RB->top
+  |  sub RA, BASE
+  |  shr RA, 3				// Byte distance -> number of 8 byte slots.
+  |  test RD, RD			// Flags for the jne below (lea and mov leave them intact).
+  |  lea NARGS:RD, [RA+1]		// RD = nargs+1 recomputed from L->top.
+  |  mov LFUNC:RB, [BASE-8]
+  |  jne ->vm_call_tail			// Returned -1?
+  |  ins_callt				// Returned 0: retry fast path.
+  |
+  |// Reconstruct previous base for vmeta_call during tailcall.
+  |->vm_call_tail:
+  |  mov RA, BASE
+  |  test PC, FRAME_TYPE		// Frame type bits are kept in the low bits of PC.
+  |  jnz >3
+  |  movzx RB, PC_RA
+  |  not RBa				// Note: ~RB = -(RB+1)
+  |  lea BASE, [BASE+RB*8]		// base = base - (RB+1)*8
+  |  jmp ->vm_call_dispatch		// Resolve again for tailcall.
+  |3:
+  |  mov RB, PC
+  |  and RB, -8				// Mask off the low 3 bits to get the frame delta.
+  |  sub BASE, RB
+  |  jmp ->vm_call_dispatch		// Resolve again for tailcall.
+  |
+  |5:  // Grow stack for fallback handler.
+  |  mov FCARG2, LUA_MINSTACK
+  |  mov FCARG1, L:RB
+  |  call extern lj_state_growstack@8	// (lua_State *L, int n)
+  |  mov BASE, L:RB->base		// Reload BASE from L after the call.
+  |  xor RD, RD				// Simulate a return 0.
+  |  jmp <1				// Dumb retry (goes through ff first).
+  |
+  |->fff_gcstep:			// Call GC step function.
+  |  // BASE = new base, RD = nargs+1
+  |  pop RBa				// Must keep stack at same level.
+  |  mov TMPa, RBa			// Save return address
+  |  mov L:RB, SAVE_L
+  |  mov SAVE_PC, PC			// Redundant (but a defined value).
+  |  mov L:RB->base, BASE
+  |  lea RD, [BASE+NARGS:RD*8-8]	// RD = BASE + nargs*8: becomes L->top.
+  |  mov FCARG1, L:RB
+  |  mov L:RB->top, RD
+  |  call extern lj_gc_step@4		// (lua_State *L)
+  |  mov BASE, L:RB->base		// Reload BASE from L after the call.
+  |  mov RD, L:RB->top
+  |  sub RD, BASE
+  |  shr RD, 3				// Byte distance -> number of 8 byte slots.
+  |  add NARGS:RD, 1			// Back to nargs+1.
+  |  mov RBa, TMPa
+  |  push RBa				// Restore return address.
+  |  ret
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Special dispatch targets -------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |->vm_record:				// Dispatch target for recording phase.
+  |.if JIT
+  |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
+  |  test RDL, HOOK_VMEVENT		// No recording while in vmevent.
+  |  jnz >5
+  |  // Decrement the hookcount for consistency, but always do the call.
+  |  test RDL, HOOK_ACTIVE
+  |  jnz >1
+  |  test RDL, LUA_MASKLINE|LUA_MASKCOUNT
+  |  jz >1
+  |  dec dword [DISPATCH+DISPATCH_GL(hookcount)]
+  |  jmp >1
+  |.endif
+  |
+  |->vm_rethook:			// Dispatch target for return hooks.
+  |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
+  |  test RDL, HOOK_ACTIVE		// Hook already active?
+  |  jnz >5
+  |  jmp >1				// Otherwise always call lj_dispatch_ins.
+  |
+  |->vm_inshook:			// Dispatch target for instr/line hooks.
+  |  movzx RD, byte [DISPATCH+DISPATCH_GL(hookmask)]
+  |  test RDL, HOOK_ACTIVE		// Hook already active?
+  |  jnz >5
+  |
+  |  test RDL, LUA_MASKLINE|LUA_MASKCOUNT
+  |  jz >5				// Neither line nor count hook set: just re-dispatch.
+  |  dec dword [DISPATCH+DISPATCH_GL(hookcount)]
+  |  jz >1				// Count reached zero: call the dispatcher.
+  |  test RDL, LUA_MASKLINE
+  |  jz >5				// No line hook either: just re-dispatch.
+  |1:
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |  mov FCARG2, PC			// Caveat: FCARG2 == BASE
+  |  mov FCARG1, L:RB
+  |  // SAVE_PC must hold the _previous_ PC. The callee updates it with PC.
+  |  call extern lj_dispatch_ins@8	// (lua_State *L, const BCIns *pc)
+  |3:
+  |  mov BASE, L:RB->base		// Reload BASE (it was overwritten as FCARG2).
+  |4:
+  |  movzx RA, PC_RA			// Re-decode the operands of the current instruction.
+  |5:
+  |  movzx OP, PC_OP
+  |  movzx RD, PC_RD
+  |.if X64
+  |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]	// Re-dispatch to static ins.
+  |.else
+  |  jmp aword [DISPATCH+OP*4+GG_DISP2STATIC]	// Re-dispatch to static ins.
+  |.endif
+  |
+  |->cont_hook:				// Continue from hook yield.
+  |  add PC, 4
+  |  mov RA, [RB-24]			// Value saved at offset -24 from RB.
+  |  mov MULTRES, RA			// Restore MULTRES for *M ins.
+  |  jmp <4
+  |
+  |->vm_hotloop:			// Hot loop counter underflow.
+  |.if JIT
+  |  mov LFUNC:RB, [BASE-8]		// Same as curr_topL(L).
+  |  mov RB, LFUNC:RB->pc
+  |  movzx RD, byte [RB+PC2PROTO(framesize)]	// Frame size of the current prototype.
+  |  lea RD, [BASE+RD*8]		// RD = BASE + framesize slots: becomes L->top.
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |  mov L:RB->top, RD
+  |  mov FCARG2, PC
+  |  lea FCARG1, [DISPATCH+GG_DISP2J]	// FCARG1 = jit_State pointer.
+  |  mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
+  |  mov SAVE_PC, PC
+  |  call extern lj_trace_hot@8		// (jit_State *J, const BCIns *pc)
+  |  jmp <3				// Rejoin the hook path: reload BASE and re-dispatch.
+  |.endif
+  |
+  |->vm_callhook:			// Dispatch target for call hooks.
+  |  mov SAVE_PC, PC
+  |.if JIT
+  |  jmp >1				// Skip the hot call marker below.
+  |.endif
+  |
+  |->vm_hotcall:			// Hot call counter underflow.
+  |.if JIT
+  |  mov SAVE_PC, PC
+  |  or PC, 1				// Marker for hot call.
+  |1:
+  |.endif
+  |  lea RD, [BASE+NARGS:RD*8-8]	// RD = BASE + nargs*8: becomes L->top.
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |  mov L:RB->top, RD
+  |  mov FCARG2, PC
+  |  mov FCARG1, L:RB
+  |  call extern lj_dispatch_call@8	// (lua_State *L, const BCIns *pc)
+  |  // ASMFunction returned in eax/rax (RDa).
+  |  mov SAVE_PC, 0			// Invalidate for subsequent line hook.
+  |.if JIT
+  |  and PC, -2				// Clear the hot call marker bit again.
+  |.endif
+  |  mov BASE, L:RB->base
+  |  mov RAa, RDa			// Park the returned function address in RA.
+  |  mov RD, L:RB->top
+  |  sub RD, BASE
+  |  mov RBa, RAa			// Move the jump target to RB; RA is reloaded next.
+  |  movzx RA, PC_RA
+  |  shr RD, 3				// Byte distance -> number of 8 byte slots.
+  |  add NARGS:RD, 1			// Back to nargs+1.
+  |  jmp RBa				// Continue at the returned ASMFunction.
+  |
+  |->cont_stitch:			// Trace stitching.
+  |.if JIT
+  |  // BASE = base, RC = result, RB = mbase
+  |  mov TRACE:RA, [RB-24]		// Save previous trace.
+  |  mov TMP1, TRACE:RA
+  |  mov TMP3, DISPATCH			// Need one more register.
+  |  mov DISPATCH, MULTRES		// DISPATCH temporarily serves as the result counter.
+  |  movzx RA, PC_RA
+  |  lea RA, [BASE+RA*8]		// Call base.
+  |  sub DISPATCH, 1
+  |  jz >2				// No results to move.
+  |1:  // Move results down.
+  |.if X64
+  |  mov RBa, [RC]
+  |  mov [RA], RBa
+  |.else
+  |  mov RB, [RC]
+  |  mov [RA], RB
+  |  mov RB, [RC+4]
+  |  mov [RA+4], RB
+  |.endif
+  |  add RC, 8
+  |  add RA, 8
+  |  sub DISPATCH, 1
+  |  jnz <1
+  |2:
+  |  movzx RC, PC_RA
+  |  movzx RB, PC_RB
+  |  add RC, RB
+  |  lea RC, [BASE+RC*8-8]		// RC = BASE + (RA+RB-1)*8: end of the wanted results.
+  |3:
+  |  cmp RC, RA
+  |  ja >9				// More results wanted?
+  |
+  |  mov DISPATCH, TMP3			// Restore the real DISPATCH.
+  |  mov TRACE:RD, TMP1			// Get previous trace.
+  |  movzx RB, word TRACE:RD->traceno
+  |  movzx RD, word TRACE:RD->link
+  |  cmp RD, RB
+  |  je ->cont_nop			// Blacklisted.
+  |  test RD, RD
+  |  jne =>BC_JLOOP			// Jump to stitched trace.
+  |
+  |  // Stitch a new trace to the previous trace.
+  |  mov [DISPATCH+DISPATCH_J(exitno)], RB
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |  mov FCARG2, PC
+  |  lea FCARG1, [DISPATCH+GG_DISP2J]	// FCARG1 = jit_State pointer.
+  |  mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
+  |  call extern lj_dispatch_stitch@8	// (jit_State *J, const BCIns *pc)
+  |  mov BASE, L:RB->base
+  |  jmp ->cont_nop
+  |
+  |9:  // Fill up results with nil.
+  |  mov dword [RA+4], LJ_TNIL		// Only the type tag word is written.
+  |  add RA, 8
+  |  jmp <3
+  |.endif
+  |
+  |->vm_profhook:			// Dispatch target for profiler hook.
+#if LJ_HASPROFILE
+  |  mov L:RB, SAVE_L
+  |  mov L:RB->base, BASE
+  |  mov FCARG2, PC			// Caveat: FCARG2 == BASE
+  |  mov FCARG1, L:RB
+  |  call extern lj_dispatch_profile@8	// (lua_State *L, const BCIns *pc)
+  |  mov BASE, L:RB->base		// Reload BASE (it was overwritten as FCARG2).
+  |  // HOOK_PROFILE is off again, so re-dispatch to dynamic instruction.
+  |  sub PC, 4				// Step back one 4 byte instruction.
+  |  jmp ->cont_nop
+#endif
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Trace exit handler -------------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |// Called from an exit stub with the exit number on the stack.
+  |// The 16 bit exit number is stored with two (sign-extended) push imm8.
+  |->vm_exit_handler:
+  |.if JIT
+  |.if X64
+  |  push r13; push r12
+  |  push r11; push r10; push r9; push r8
+  |  push rdi; push rsi; push rbp; lea rbp, [rsp+88]; push rbp
+  |  push rbx; push rdx; push rcx; push rax
+  |  movzx RC, byte [rbp-8]		// Reconstruct exit number.
+  |  mov RCH, byte [rbp-16]
+  |  mov [rbp-8], r15; mov [rbp-16], r14	// Reuse the two exit number slots for r15/r14.
+  |.else
+  |  push ebp; lea ebp, [esp+12]; push ebp
+  |  push ebx; push edx; push ecx; push eax
+  |  movzx RC, byte [ebp-4]		// Reconstruct exit number.
+  |  mov RCH, byte [ebp-8]
+  |  mov [ebp-4], edi; mov [ebp-8], esi	// Reuse the two exit number slots for edi/esi.
+  |.endif
+  |  // Caveat: DISPATCH is ebx.
+  |  mov DISPATCH, [ebp]
+  |  mov RA, [DISPATCH+DISPATCH_GL(vmstate)]	// Get trace number.
+  |  set_vmstate EXIT
+  |  mov [DISPATCH+DISPATCH_J(exitno)], RC
+  |  mov [DISPATCH+DISPATCH_J(parent)], RA
+  |.if X64
+  |.if X64WIN
+  |  sub rsp, 16*8+4*8			// Room for SSE regs + save area.
+  |.else
+  |  sub rsp, 16*8			// Room for SSE regs.
+  |.endif
+  |  add rbp, -128
+  |  movsd qword [rbp-8],   xmm15; movsd qword [rbp-16],  xmm14
+  |  movsd qword [rbp-24],  xmm13; movsd qword [rbp-32],  xmm12
+  |  movsd qword [rbp-40],  xmm11; movsd qword [rbp-48],  xmm10
+  |  movsd qword [rbp-56],  xmm9;  movsd qword [rbp-64],  xmm8
+  |  movsd qword [rbp-72],  xmm7;  movsd qword [rbp-80],  xmm6
+  |  movsd qword [rbp-88],  xmm5;  movsd qword [rbp-96],  xmm4
+  |  movsd qword [rbp-104], xmm3;  movsd qword [rbp-112], xmm2
+  |  movsd qword [rbp-120], xmm1;  movsd qword [rbp-128], xmm0
+  |.else
+  |  sub esp, 8*8+16			// Room for SSE regs + args.
+  |  movsd qword [ebp-40], xmm7; movsd qword [ebp-48], xmm6
+  |  movsd qword [ebp-56], xmm5; movsd qword [ebp-64], xmm4
+  |  movsd qword [ebp-72], xmm3; movsd qword [ebp-80], xmm2
+  |  movsd qword [ebp-88], xmm1; movsd qword [ebp-96], xmm0
+  |.endif
+  |  // Caveat: RB is ebp.
+  |  mov L:RB, [DISPATCH+DISPATCH_GL(cur_L)]
+  |  mov BASE, [DISPATCH+DISPATCH_GL(jit_base)]
+  |  mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
+  |  mov L:RB->base, BASE
+  |.if X64WIN
+  |  lea CARG2, [rsp+4*8]		// ExitState pointer: saved regs start above the 4*8 byte save area.
+  |.elif X64
+  |  mov CARG2, rsp			// ExitState pointer: start of the saved regs.
+  |.else
+  |  lea FCARG2, [esp+16]		// ExitState pointer: saved regs start above the 16 byte arg area.
+  |.endif
+  |  lea FCARG1, [DISPATCH+GG_DISP2J]	// FCARG1 = jit_State pointer.
+  |  mov dword [DISPATCH+DISPATCH_GL(jit_base)], 0
+  |  call extern lj_trace_exit@8	// (jit_State *J, ExitState *ex)
+  |  // MULTRES or negated error code returned in eax (RD).
+  |  mov RAa, L:RB->cframe
+  |  and RAa, CFRAME_RAWMASK
+  |.if X64WIN
+  |  // Reposition stack later.
+  |.elif X64
+  |  mov rsp, RAa			// Reposition stack to C frame.
+  |.else
+  |  mov esp, RAa			// Reposition stack to C frame.
+  |.endif
+  |  mov [RAa+CFRAME_OFS_L], L:RB	// Set SAVE_L (on-trace resume/yield).
+  |  mov BASE, L:RB->base
+  |  mov PC, [RAa+CFRAME_OFS_PC]	// Get SAVE_PC.
+  |.if X64
+  |  jmp >1				// Skip the stack setup at the start of vm_exit_interp.
+  |.endif
+  |.endif
+  |->vm_exit_interp:
+  |  // RD = MULTRES or negated error code, BASE, PC and DISPATCH set.
+  |.if JIT
+  |.if X64
+  |  // Restore additional callee-save registers only used in compiled code.
+  |.if X64WIN
+  |  lea RAa, [rsp+9*16+4*8]
+  |1:
+  |  movdqa xmm15, [RAa-9*16]
+  |  movdqa xmm14, [RAa-8*16]
+  |  movdqa xmm13, [RAa-7*16]
+  |  movdqa xmm12, [RAa-6*16]
+  |  movdqa xmm11, [RAa-5*16]
+  |  movdqa xmm10, [RAa-4*16]
+  |  movdqa xmm9, [RAa-3*16]
+  |  movdqa xmm8, [RAa-2*16]
+  |  movdqa xmm7, [RAa-1*16]
+  |  mov rsp, RAa			// Reposition stack to C frame.
+  |  movdqa xmm6, [RAa]
+  |  mov r15, CSAVE_3
+  |  mov r14, CSAVE_4
+  |.else
+  |  add rsp, 16			// Reposition stack to C frame.
+  |1:
+  |.endif
+  |  mov r13, TMPa
+  |  mov r12, TMPQ
+  |.endif
+  |  cmp RD, -LUA_ERRERR; jae >9	// Check for error from exit.
+  |  mov L:RB, SAVE_L
+  |  mov MULTRES, RD
+  |  mov LFUNC:KBASE, [BASE-8]		// Current function object.
+  |  mov KBASE, LFUNC:KBASE->pc
+  |  mov KBASE, [KBASE+PC2PROTO(k)]	// KBASE = constants of its prototype.
+  |  mov L:RB->base, BASE
+  |  mov dword [DISPATCH+DISPATCH_GL(jit_base)], 0
+  |  set_vmstate INTERP
+  |  // Modified copy of ins_next which handles function header dispatch, too.
+  |  mov RC, [PC]
+  |  movzx RA, RCH
+  |  movzx OP, RCL
+  |  add PC, 4
+  |  shr RC, 16
+  |  cmp MULTRES, -17			// Static dispatch?
+  |  je >5
+  |  cmp OP, BC_FUNCF			// Function header?
+  |  jb >3
+  |  cmp OP, BC_FUNCC+2			// Fast function?
+  |  jae >4
+  |2:
+  |  mov RC, MULTRES			// RC/RD holds nres+1.
+  |3:
+  |.if X64
+  |  jmp aword [DISPATCH+OP*8]
+  |.else
+  |  jmp aword [DISPATCH+OP*4]
+  |.endif
+  |
+  |4:  // Check frame below fast function.
+  |  mov RC, [BASE-4]
+  |  test RC, FRAME_TYPE
+  |  jnz <2				// Trace stitching continuation?
+  |  // Otherwise set KBASE for Lua function below fast function.
+  |  movzx RC, byte [RC-3]		// Byte at offset 1 of the instruction before the return PC.
+  |  not RCa				// ~RC = -(RC+1), used as a negative slot index.
+  |  mov LFUNC:KBASE, [BASE+RC*8-8]
+  |  mov KBASE, LFUNC:KBASE->pc
+  |  mov KBASE, [KBASE+PC2PROTO(k)]
+  |  jmp <2
+  |
+  |5:  // Dispatch to static entry of original ins replaced by BC_JLOOP.
+  |  mov RA, [DISPATCH+DISPATCH_J(trace)]
+  |  mov TRACE:RA, [RA+RD*4]		// Trace table entry indexed by RD.
+  |  mov RC, TRACE:RA->startins
+  |  movzx RA, RCH
+  |  movzx OP, RCL
+  |  shr RC, 16
+  |.if X64
+  |  jmp aword [DISPATCH+OP*8+GG_DISP2STATIC]
+  |.else
+  |  jmp aword [DISPATCH+OP*4+GG_DISP2STATIC]
+  |.endif
+  |
+  |9:  // Rethrow error from the right C frame.
+  |  mov FCARG2, RD
+  |  mov FCARG1, L:RB
+  |  neg FCARG2				// Undo the negation: positive error code.
+  |  call extern lj_err_trace@8		// (lua_State *L, int errcode)
+  |.endif
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Math helper functions ----------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |// FP value rounding. Called by math.floor/math.ceil fast functions
+  |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
+  |// mode selects the rounding: 0 = floor, 1 = ceil, 2 = trunc. If cond is
+  |// true on x86, an extra entry taking the arg from [esp+4] is emitted, too.
+  |.macro vm_round, name, mode, cond
+  |->name:
+  |.if not X64 and cond
+  |  movsd xmm0, qword [esp+4]
+  |  call ->name .. _sse
+  |  movsd qword [esp+4], xmm0  // Overwrite callee-owned arg.
+  |  fld qword [esp+4]			// Return the result on the x87 stack.
+  |  ret
+  |.endif
+  |
+  |->name .. _sse:
+  |  sseconst_abs xmm2, RDa
+  |  sseconst_2p52 xmm3, RDa
+  |  movaps xmm1, xmm0
+  |  andpd xmm1, xmm2			// |x|
+  |  ucomisd xmm3, xmm1			// No truncation if 2^52 <= |x|.
+  |  jbe >1				// Also taken for NaN (unordered sets CF and ZF).
+  |  andnpd xmm2, xmm0			// Isolate sign bit.
+  |.if mode == 2		// trunc(x)?
+  |  movaps xmm0, xmm1
+  |  addsd xmm1, xmm3			// (|x| + 2^52) - 2^52
+  |  subsd xmm1, xmm3
+  |  sseconst_1 xmm3, RDa
+  |  cmpsd xmm0, xmm1, 1		// |x| < result?
+  |  andpd xmm0, xmm3			// Compare mask & 1.0: 1.0 if true, else 0.
+  |  subsd xmm1, xmm0			// If yes, subtract 1.
+  |  orpd xmm1, xmm2			// Merge sign bit back in.
+  |.else
+  |  addsd xmm1, xmm3			// (|x| + 2^52) - 2^52
+  |  subsd xmm1, xmm3
+  |  orpd xmm1, xmm2			// Merge sign bit back in.
+  |  sseconst_1 xmm3, RDa
+  |  .if mode == 1		// ceil(x)?
+  |    cmpsd xmm0, xmm1, 6		// x > result?
+  |    andpd xmm0, xmm3			// Compare mask & 1.0: 1.0 if true, else 0.
+  |    addsd xmm1, xmm0			// If yes, add 1.
+  |    orpd xmm1, xmm2			// Merge sign bit back in (again).
+  |  .else			// floor(x)?
+  |    cmpsd xmm0, xmm1, 1		// x < result?
+  |    andpd xmm0, xmm3			// Compare mask & 1.0: 1.0 if true, else 0.
+  |    subsd xmm1, xmm0			// If yes, subtract 1.
+  |  .endif
+  |.endif
+  |  movaps xmm0, xmm1
+  |1:
+  |  ret
+  |.endmacro
+  |
+  |  vm_round vm_floor, 0, 1
+  |  vm_round vm_ceil,  1, JIT
+  |  vm_round vm_trunc, 2, JIT
+  |
+  |// FP modulo x%y. Called by BC_MOD* and vm_arith.
+  |// Computes x - floor(x/y)*y.
+  |->vm_mod:
+  |// Args in xmm0/xmm1, return value in xmm0.
+  |// Caveat: xmm0-xmm5 and RC (eax) modified!
+  |  movaps xmm5, xmm0			// Keep x.
+  |  divsd xmm0, xmm1			// xmm0 = x/y.
+  |  sseconst_abs xmm2, RDa
+  |  sseconst_2p52 xmm3, RDa
+  |  movaps xmm4, xmm0
+  |  andpd xmm4, xmm2			// |x/y|
+  |  ucomisd xmm3, xmm4			// No truncation if 2^52 <= |x/y|.
+  |  jbe >1
+  |  andnpd xmm2, xmm0			// Isolate sign bit.
+  |  addsd xmm4, xmm3			// (|x/y| + 2^52) - 2^52
+  |  subsd xmm4, xmm3
+  |  orpd xmm4, xmm2			// Merge sign bit back in.
+  |  sseconst_1 xmm2, RDa
+  |  cmpsd xmm0, xmm4, 1		// x/y < result?
+  |  andpd xmm0, xmm2			// Compare mask & 1.0: 1.0 if true, else 0.
+  |  subsd xmm4, xmm0			// If yes, subtract 1.0.
+  |  movaps xmm0, xmm5
+  |  mulsd xmm1, xmm4			// y * floor(x/y).
+  |  subsd xmm0, xmm1			// x - y*floor(x/y).
+  |  ret
+  |1:
+  |  mulsd xmm1, xmm0			// Quotient used unrounded: y * (x/y).
+  |  movaps xmm0, xmm5
+  |  subsd xmm0, xmm1
+  |  ret
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Miscellaneous functions --------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |// int lj_vm_cpuid(uint32_t f, uint32_t res[4])
+  |// Executes CPUID for function f (with ecx = 0) and stores eax, ebx, ecx, edx
+  |// into res[0..3]. The x86 variant returns 0 if CPUID is unsupported.
+  |->vm_cpuid:
+  |.if X64
+  |  mov eax, CARG1d
+  |  .if X64WIN; push rsi; mov rsi, CARG2; .endif
+  |  push rbx				// cpuid overwrites ebx; saved and restored here.
+  |  xor ecx, ecx
+  |  cpuid
+  |  mov [rsi], eax
+  |  mov [rsi+4], ebx
+  |  mov [rsi+8], ecx
+  |  mov [rsi+12], edx
+  |  pop rbx
+  |  .if X64WIN; pop rsi; .endif
+  |  ret
+  |.else
+  |  pushfd
+  |  pop edx
+  |  mov ecx, edx			// Original flags for the comparison below.
+  |  xor edx, 0x00200000		// Toggle ID bit in flags.
+  |  push edx
+  |  popfd
+  |  pushfd
+  |  pop edx				// Read the flags back.
+  |  xor eax, eax			// Zero means no features supported.
+  |  cmp ecx, edx
+  |  jz >1				// No ID toggle means no CPUID support.
+  |  mov eax, [esp+4]			// Argument 1 is function number.
+  |  push edi
+  |  push ebx
+  |  xor ecx, ecx
+  |  cpuid
+  |  mov edi, [esp+16]			// Argument 2 is result area.
+  |  mov [edi], eax
+  |  mov [edi+4], ebx
+  |  mov [edi+8], ecx
+  |  mov [edi+12], edx
+  |  pop ebx
+  |  pop edi
+  |1:
+  |  ret
+  |.endif
+  |
+  |.define NEXT_TAB,		TAB:FCARG1	// Table argument.
+  |.define NEXT_IDX,		FCARG2		// Index argument; advanced during traversal.
+  |.define NEXT_PTR,		RCa
+  |.define NEXT_PTRd,		RC
+  |.macro NEXT_RES_IDXL, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
+  |.if X64
+  |.define NEXT_TMP,		CARG3d
+  |.define NEXT_TMPq,		CARG3
+  |.define NEXT_ASIZE,		CARG4d
+  |.macro NEXT_ENTER;		.endmacro
+  |.macro NEXT_LEAVE;		ret; .endmacro
+  |.if X64WIN
+  |.define NEXT_RES_PTR,	[rsp+aword*5]
+  |.macro NEXT_RES_IDX, op2;	add NEXT_IDX, op2; .endmacro
+  |.else
+  |.define NEXT_RES_PTR,	[rsp+aword*1]
+  |.macro NEXT_RES_IDX, op2;	lea edx, [NEXT_IDX+op2]; .endmacro
+  |.endif
+  |.else
+  |.define NEXT_ASIZE,		esi
+  |.define NEXT_TMP,		edi
+  |.macro NEXT_ENTER;		push esi; push edi; .endmacro
+  |.macro NEXT_LEAVE;		pop edi; pop esi; ret; .endmacro
+  |.define NEXT_RES_PTR,	[esp+dword*3]
+  |.macro NEXT_RES_IDX, op2;	add NEXT_IDX, op2; .endmacro
+  |.endif
+  |
+  |// TValue *lj_vm_next(GCtab *t, uint32_t idx)
+  |// Next idx returned in edx.
+  |->vm_next:
+  |.if JIT
+  |  NEXT_ENTER
+  |  mov NEXT_ASIZE, NEXT_TAB->asize
+  |1:  // Traverse array part.
+  |  cmp NEXT_IDX, NEXT_ASIZE;  jae >5
+  |  mov NEXT_TMP, NEXT_TAB->array
+  |  cmp dword [NEXT_TMP+NEXT_IDX*8+4], LJ_TNIL;  je >2
+  |  lea NEXT_PTR, NEXT_RES_PTR		// Result area on the stack: value, then key.
+  |.if X64
+  |  mov NEXT_TMPq, qword [NEXT_TMP+NEXT_IDX*8]
+  |  mov qword [NEXT_PTR], NEXT_TMPq	// Copy the 8 byte array value.
+  |.else
+  |  mov NEXT_ASIZE, dword [NEXT_TMP+NEXT_IDX*8+4]	// NEXT_ASIZE reused as scratch.
+  |  mov NEXT_TMP, dword [NEXT_TMP+NEXT_IDX*8]
+  |  mov dword [NEXT_PTR+4], NEXT_ASIZE
+  |  mov dword [NEXT_PTR], NEXT_TMP
+  |.endif
+  |.if DUALNUM
+  |  mov dword [NEXT_PTR+dword*3], LJ_TISNUM	// Key: integer tag and index.
+  |  mov dword [NEXT_PTR+dword*2], NEXT_IDX
+  |.else
+  |  cvtsi2sd xmm0, NEXT_IDX		// Key: index as a double.
+  |  movsd qword [NEXT_PTR+dword*2], xmm0
+  |.endif
+  |  NEXT_RES_IDX 1			// Next idx = idx+1.
+  |  NEXT_LEAVE
+  |2:  // Skip holes in array part.
+  |  add NEXT_IDX, 1
+  |  jmp <1
+  |
+  |5:  // Traverse hash part.
+  |  sub NEXT_IDX, NEXT_ASIZE		// Index relative to the hash part.
+  |6:
+  |  cmp NEXT_IDX, NEXT_TAB->hmask; ja >9
+  |  imul NEXT_PTRd, NEXT_IDX, #NODE	// Byte offset of node idx.
+  |  add NODE:NEXT_PTRd, dword NEXT_TAB->node
+  |  cmp dword NODE:NEXT_PTR->val.it, LJ_TNIL; je >7
+  |  NEXT_RES_IDXL NEXT_ASIZE+1		// Next idx = asize+idx+1; node pointer stays in NEXT_PTR.
+  |  NEXT_LEAVE
+  |7:  // Skip holes in hash part.
+  |  add NEXT_IDX, 1
+  |  jmp <6
+  |
+  |9:  // End of iteration. Set the key to nil (not the value).
+  |  NEXT_RES_IDX NEXT_ASIZE
+  |  lea NEXT_PTR, NEXT_RES_PTR
+  |  mov dword [NEXT_PTR+dword*3], LJ_TNIL
+  |  NEXT_LEAVE
+  |.endif
+  |
+  |//-----------------------------------------------------------------------
+  |//-- Assertions ---------------------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |->assert_bad_for_arg_type:		// Assertion target: traps with a breakpoint instruction.
+#ifdef LUA_USE_ASSERT
+  |  int3
+#endif
+  |  int3				// Emitted unconditionally, so the label is never empty.
+  |
+  |//-----------------------------------------------------------------------
+  |//-- FFI helper functions -----------------------------------------------
+  |//-----------------------------------------------------------------------
+  |
+  |// Handler for callback functions. Callback slot number in ah/al.
+  |// Saves the incoming argument registers into the CTState callback area,
+  |// calls lj_ccallback_enter and then enters the returned Lua state.
+  |->vm_ffi_callback:
+  |.if FFI
+  |.type CTSTATE, CTState, PC
+  |.if not X64
+  |  sub esp, 16			// Leave room for SAVE_ERRF etc.
+  |.endif
+  |  saveregs_	// ebp/rbp already saved. ebp now holds global_State *.
+  |  lea DISPATCH, [ebp+GG_G2DISP]
+  |  mov CTSTATE, GL:ebp->ctype_state
+  |  movzx eax, ax			// Keep only the 16 bit slot number.
+  |  mov CTSTATE->cb.slot, eax
+  |.if X64
+  |  mov CTSTATE->cb.gpr[0], CARG1
+  |  mov CTSTATE->cb.gpr[1], CARG2
+  |  mov CTSTATE->cb.gpr[2], CARG3
+  |  mov CTSTATE->cb.gpr[3], CARG4
+  |  movsd qword CTSTATE->cb.fpr[0], xmm0
+  |  movsd qword CTSTATE->cb.fpr[1], xmm1
+  |  movsd qword CTSTATE->cb.fpr[2], xmm2
+  |  movsd qword CTSTATE->cb.fpr[3], xmm3
+  |.if X64WIN
+  |  lea rax, [rsp+CFRAME_SIZE+4*8]	// Address past the C frame plus 4*8 bytes.
+  |.else
+  |  lea rax, [rsp+CFRAME_SIZE]		// Address just past the C frame.
+  |  mov CTSTATE->cb.gpr[4], CARG5
+  |  mov CTSTATE->cb.gpr[5], CARG6
+  |  movsd qword CTSTATE->cb.fpr[4], xmm4
+  |  movsd qword CTSTATE->cb.fpr[5], xmm5
+  |  movsd qword CTSTATE->cb.fpr[6], xmm6
+  |  movsd qword CTSTATE->cb.fpr[7], xmm7
+  |.endif
+  |  mov CTSTATE->cb.stack, rax
+  |  mov CARG2, rsp			// 2nd arg of lj_ccallback_enter: the C frame.
+  |.else
+  |  lea eax, [esp+CFRAME_SIZE+16]
+  |  mov CTSTATE->cb.gpr[0], FCARG1
+  |  mov CTSTATE->cb.gpr[1], FCARG2
+  |  mov CTSTATE->cb.stack, eax
+  |  mov FCARG1, [esp+CFRAME_SIZE+12]	// Move around misplaced retaddr/ebp.
+  |  mov FCARG2, [esp+CFRAME_SIZE+8]
+  |  mov SAVE_RET, FCARG1
+  |  mov SAVE_R4, FCARG2
+  |  mov FCARG2, esp			// 2nd arg of lj_ccallback_enter: the C frame.
+  |.endif
+  |  mov SAVE_PC, CTSTATE		// Any value outside of bytecode is ok.
+  |  mov FCARG1, CTSTATE
+  |  call extern lj_ccallback_enter@8	// (CTState *cts, void *cf)
+  |  // lua_State * returned in eax (RD).
+  |  set_vmstate INTERP
+  |  mov BASE, L:RD->base
+  |  mov RD, L:RD->top
+  |  sub RD, BASE
+  |  mov LFUNC:RB, [BASE-8]
+  |  shr RD, 3				// Byte distance -> number of 8 byte slots.
+  |  add RD, 1				// nargs+1.
+  |  ins_callt
+  |.endif
+  |
+  |->cont_ffi_callback:			// Return from FFI callback.
+  |.if FFI
+  |  mov L:RA, SAVE_L
+  |  mov CTSTATE, [DISPATCH+DISPATCH_GL(ctype_state)]
+  |  mov aword CTSTATE->L, L:RAa
+  |  mov L:RA->base, BASE
+  |  mov L:RA->top, RB
+  |  mov FCARG1, CTSTATE
+  |  mov FCARG2, RC
+  |  call extern lj_ccallback_leave@8	// (CTState *cts, TValue *o)
+  |.if X64
+  |  mov rax, CTSTATE->cb.gpr[0]	// Integer result.
+  |  movsd xmm0, qword CTSTATE->cb.fpr[0]	// FP result.
+  |  jmp ->vm_leave_unw
+  |.else
+  |  mov L:RB, SAVE_L
+  |  mov eax, CTSTATE->cb.gpr[0]
+  |  mov edx, CTSTATE->cb.gpr[1]
+  |  cmp dword CTSTATE->cb.gpr[2], 1	// gpr[2] selects the x87 result: <1 none, 1 float, >1 double.
+  |  jb >7
+  |  je >6
+  |  fld qword CTSTATE->cb.fpr[0].d
+  |  jmp >7
+  |6:
+  |  fld dword CTSTATE->cb.fpr[0].f
+  |7:
+  |  mov ecx, L:RB->top
+  |  movzx ecx, word [ecx+6]		// Get stack adjustment and copy up.
+  |  mov SAVE_L, ecx			// Must be one slot above SAVE_RET
+  |  restoreregs
+  |  pop ecx				// Move return addr from SAVE_RET.
+  |  add esp, [esp]			// Adjust stack.
+  |  add esp, 16
+  |  push ecx
+  |  ret
+  |.endif
+  |.endif
+  |
+  |->vm_ffi_call@4:			// Call C function via FFI.
+  |  // Caveat: needs special frame unwinding, see below.
+  |.if FFI
+  |.if X64
+  |  .type CCSTATE, CCallState, rbx
+  |  push rbp; mov rbp, rsp; push rbx; mov CCSTATE, CARG1
+  |.else
+  |  .type CCSTATE, CCallState, ebx
+  |  push ebp; mov ebp, esp; push ebx; mov CCSTATE, FCARG1
+  |.endif
+  |
+  |  // Readjust stack.
+  |.if X64
+  |  mov eax, CCSTATE->spadj
+  |  sub rsp, rax
+  |.else
+  |  sub esp, CCSTATE->spadj
+  |.if WIN
+  |  mov CCSTATE->spadj, esp		// Remember esp; compared against esp after the call.
+  |.endif
+  |.endif
+  |
+  |  // Copy stack slots.
+  |  movzx ecx, byte CCSTATE->nsp
+  |  sub ecx, 1
+  |  js >2				// No stack slots to copy.
+  |1:
+  |.if X64
+  |  mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)]
+  |  mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax
+  |.else
+  |  mov eax, [CCSTATE+ecx*4+offsetof(CCallState, stack)]
+  |  mov [esp+ecx*4], eax
+  |.endif
+  |  sub ecx, 1
+  |  jns <1				// Copy from the last slot down to slot 0.
+  |2:
+  |
+  |.if X64
+  |  movzx eax, byte CCSTATE->nfpr	// Number of FP regs used; stays in eax/al for the call.
+  |  mov CARG1, CCSTATE->gpr[0]
+  |  mov CARG2, CCSTATE->gpr[1]
+  |  mov CARG3, CCSTATE->gpr[2]
+  |  mov CARG4, CCSTATE->gpr[3]
+  |.if not X64WIN
+  |  mov CARG5, CCSTATE->gpr[4]
+  |  mov CARG6, CCSTATE->gpr[5]
+  |.endif
+  |  test eax, eax; jz >5		// No FP args: skip loading xmm regs.
+  |  movaps xmm0, CCSTATE->fpr[0]
+  |  movaps xmm1, CCSTATE->fpr[1]
+  |  movaps xmm2, CCSTATE->fpr[2]
+  |  movaps xmm3, CCSTATE->fpr[3]
+  |.if not X64WIN
+  |  cmp eax, 4; jbe >5			// At most 4 FP args: xmm4-xmm7 not needed.
+  |  movaps xmm4, CCSTATE->fpr[4]
+  |  movaps xmm5, CCSTATE->fpr[5]
+  |  movaps xmm6, CCSTATE->fpr[6]
+  |  movaps xmm7, CCSTATE->fpr[7]
+  |.endif
+  |5:
+  |.else
+  |  mov FCARG1, CCSTATE->gpr[0]
+  |  mov FCARG2, CCSTATE->gpr[1]
+  |.endif
+  |
+  |  call aword CCSTATE->func
+  |
+  |.if X64
+  |  mov CCSTATE->gpr[0], rax		// Store the return registers back into the call state.
+  |  movaps CCSTATE->fpr[0], xmm0
+  |.if not X64WIN
+  |  mov CCSTATE->gpr[1], rdx
+  |  movaps CCSTATE->fpr[1], xmm1
+  |.endif
+  |.else
+  |  mov CCSTATE->gpr[0], eax		// Store the return registers back into the call state.
+  |  mov CCSTATE->gpr[1], edx
+  |  cmp byte CCSTATE->resx87, 1	// resx87 selects the x87 result: <1 none, 1 float, >1 double.
+  |  jb >7
+  |  je >6
+  |  fstp qword CCSTATE->fpr[0].d[0]
+  |  jmp >7
+  |6:
+  |  fstp dword CCSTATE->fpr[0].f[0]
+  |7:
+  |.if WIN
+  |  sub CCSTATE->spadj, esp		// spadj = esp before the call - esp after it.
+  |.endif
+  |.endif
+  |
+  |.if X64
+  |  mov rbx, [rbp-8]; leave; ret	// Restore rbx from its frame slot, then unwind the frame.
+  |.else
+  |  mov ebx, [ebp-4]; leave; ret	// Restore ebx from its frame slot, then unwind the frame.
+  |.endif
+  |.endif
+  |// Note: vm_ffi_call must be the last function in this object file!
+  |
+  |//-----------------------------------------------------------------------
+}
+
+/* Generate the code for a single instruction. */
+static void build_ins(BuildCtx *ctx, BCOp op, int defop)
+{
+  int vk = 0;
+  |// Note: aligning all instructions does not pay off.
+  |=>defop:
+
+  switch (op) {
+
+  /* -- Comparison ops ---------------------------------------------------- */
+
+  /* Remember: all ops branch for a true comparison, fall through otherwise. */
+
+  |// Emits exactly one conditional jump to `target`. The mnemonic is picked at
+  |// build time from the C variable `op`: `lt` for BC_ISLT, `ge` for BC_ISGE,
+  |// `le` for BC_ISLE and `gt` for BC_ISGT. Nothing is emitted for other opcodes.
+  |.macro jmp_comp, lt, ge, le, gt, target
+  ||switch (op) {
+  ||case BC_ISLT:
+  |   lt target
+  ||break;
+  ||case BC_ISGE:
+  |   ge target
+  ||break;
+  ||case BC_ISLE:
+  |   le target
+  ||break;
+  ||case BC_ISGT:
+  |   gt target
+  ||break;
+  ||default: break;  /* Shut up GCC. */
+  ||}
+  |.endmacro
+
+  case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT:
+    |  // Ordered comparison of slots RA and RD. PC is advanced by 4 before the
+    |  // result is used. If the comparison holds, the jump operand is fetched
+    |  // with PC_RD and applied with branchPC; otherwise execution continues at
+    |  // the advanced PC. Every jmp_comp below therefore names the jumps that
+    |  // SKIP the branch, in the order LT, GE, LE, GT.
+    |  // RA = src1, RD = src2, JMP with RD = target
+    |  ins_AD
+    |.if DUALNUM
+    |  checkint RA, >7
+    |  checkint RD, >8
+    |  // Both operands are integers: signed 32 bit compare of RA against RD.
+    |  mov RB, dword [BASE+RA*8]
+    |  add PC, 4
+    |  cmp RB, dword [BASE+RD*8]
+    |  jmp_comp jge, jl, jg, jle, >9
+    |6:  // Comparison holds: take the branch.
+    |  movzx RD, PC_RD
+    |  branchPC RD
+    |9:
+    |  ins_next
+    |
+    |7:  // RA is not an integer.
+    |  ja ->vmeta_comp
+    |  // RA is a number.
+    |  cmp dword [BASE+RD*8+4], LJ_TISNUM; jb >1; jne ->vmeta_comp
+    |  // RA is a number, RD is an integer.
+    |  cvtsi2sd xmm0, dword [BASE+RD*8]
+    |  jmp >2
+    |
+    |8:  // RA is an integer, RD is not an integer.
+    |  ja ->vmeta_comp
+    |  // RA is an integer, RD is a number.
+    |  cvtsi2sd xmm1, dword [BASE+RA*8]
+    |  movsd xmm0, qword [BASE+RD*8]
+    |  add PC, 4
+    |  ucomisd xmm0, xmm1
+    |  jmp_comp jbe, ja, jb, jae, <9
+    |  jmp <6
+    |.else
+    |  checknum RA, ->vmeta_comp
+    |  checknum RD, ->vmeta_comp
+    |.endif
+    |1:
+    |  movsd xmm0, qword [BASE+RD*8]
+    |2:
+    |  add PC, 4
+    |  // xmm0 holds the RD operand on every path to here, so this compares RD
+    |  // against RA and the skip conditions below are the mirrored unsigned ones.
+    |  ucomisd xmm0, qword [BASE+RA*8]
+    |3:
+    |  // Unordered: all of ZF CF PF set, ordered: PF clear.
+    |  // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
+    |.if DUALNUM
+    |  jmp_comp jbe, ja, jb, jae, <9
+    |  jmp <6
+    |.else
+    |  jmp_comp jbe, ja, jb, jae, >1
+    |  movzx RD, PC_RD
+    |  branchPC RD
+    |1:
+    |  ins_next
+    |.endif
+    break;
+
+  case BC_ISEQV: case BC_ISNEV:
+    /* Equality (BC_ISEQV, vk = 1) or inequality (BC_ISNEV, vk = 0) test of
+    ** slots RA and RD. In the shared tail at iseqne_end, local label 1 is
+    ** always the "equal" outcome and local label 2 the "not equal" outcome;
+    ** vk decides which of the two takes the branch.
+    */
+    vk = op == BC_ISEQV;
+    |  ins_AD	// RA = src1, RD = src2, JMP with RD = target
+    |  mov RB, [BASE+RD*8+4]
+    |  add PC, 4
+    |.if DUALNUM
+    |  cmp RB, LJ_TISNUM; jne >7
+    |  checkint RA, >8
+    |  // Both operands are integers: compare the 32 bit payloads directly.
+    |  mov RB, dword [BASE+RD*8]
+    |  cmp RB, dword [BASE+RA*8]
+    if (vk) {
+      |  jne >9
+    } else {
+      |  je >9
+    }
+    |  movzx RD, PC_RD
+    |  branchPC RD
+    |9:
+    |  ins_next
+    |
+    |7:  // RD is not an integer.
+    |  ja >5
+    |  // RD is a number.
+    |  cmp dword [BASE+RA*8+4], LJ_TISNUM; jb >1; jne >5
+    |  // RD is a number, RA is an integer.
+    |  cvtsi2sd xmm0, dword [BASE+RA*8]
+    |  jmp >2
+    |
+    |8:  // RD is an integer, RA is not an integer.
+    |  ja >5
+    |  // RD is an integer, RA is a number.
+    |  cvtsi2sd xmm0, dword [BASE+RD*8]
+    |  ucomisd xmm0, qword [BASE+RA*8]
+    |  jmp >4
+    |
+    |.else
+    |  cmp RB, LJ_TISNUM; jae >5
+    |  checknum RA, >5
+    |.endif
+    |1:
+    |  movsd xmm0, qword [BASE+RA*8]
+    |2:
+    |  ucomisd xmm0, qword [BASE+RD*8]
+    |4:
+  iseqne_fp:  /* Also entered by goto from BC_ISEQN/BC_ISNEN, flags from ucomisd. */
+    if (vk) {
+      |  jp >2				// Unordered means not equal.
+      |  jne >2
+    } else {
+      |  jp >2				// Unordered means not equal.
+      |  je >1
+    }
+  iseqne_end:  /* Also entered by goto from BC_ISEQS/BC_ISNES (and via iseqne_test). */
+    if (vk) {
+      |1:				// EQ: Branch to the target.
+      |  movzx RD, PC_RD
+      |  branchPC RD
+      |2:				// NE: Fallthrough to next instruction.
+      |  // Without FFI, local label 3 is simply the "not equal" outcome.
+      |.if not FFI
+      |3:
+      |.endif
+    } else {
+      |  // Without FFI, local label 3 is simply the "not equal" outcome.
+      |.if not FFI
+      |3:
+      |.endif
+      |2:				// NE: Branch to the target.
+      |  movzx RD, PC_RD
+      |  branchPC RD
+      |1:				// EQ: Fallthrough to next instruction.
+    }
+    /* The DUALNUM variants of these ops already emitted ins_next at local
+    ** label 9, so they jump back to it instead of emitting a second copy.
+    */
+    if (LJ_DUALNUM && (op == BC_ISEQV || op == BC_ISNEV ||
+		       op == BC_ISEQN || op == BC_ISNEN)) {
+      |  jmp <9
+    } else {
+      |  ins_next
+    }
+    |
+    if (op == BC_ISEQV || op == BC_ISNEV) {
+      |5:  // Either or both types are not numbers.
+      |.if FFI
+      |  cmp RB, LJ_TCDATA; je ->vmeta_equal_cd
+      |  checktp RA, LJ_TCDATA; je ->vmeta_equal_cd
+      |.endif
+      |  checktp RA, RB			// Compare types.
+      |  jne <2				// Not the same type?
+      |  cmp RB, LJ_TISPRI
+      |  jae <1				// Same type and primitive type?
+      |
+      |  // Same types and not a primitive type. Compare GCobj or pvalue.
+      |  mov RA, [BASE+RA*8]
+      |  mov RD, [BASE+RD*8]
+      |  cmp RA, RD
+      |  je <1				// Same GCobjs or pvalues?
+      |  cmp RB, LJ_TISTABUD
+      |  ja <2				// Different objects and not table/ud?
+      |.if X64
+      |  cmp RB, LJ_TUDATA		// And not 64 bit lightuserdata.
+      |  jb <2
+      |.endif
+      |
+      |  // Different tables or userdatas. Need to check __eq metamethod.
+      |  // Field metatable must be at same offset for GCtab and GCudata!
+      |  mov TAB:RB, TAB:RA->metatable
+      |  test TAB:RB, TAB:RB
+      |  jz <2				// No metatable?
+      |  test byte TAB:RB->nomm, 1<<MM_eq
+      |  jnz <2				// Or 'no __eq' flag set?
+      if (vk) {
+	|  xor RB, RB			// ne = 0
+      } else {
+	|  mov RB, 1			// ne = 1
+      }
+      |  jmp ->vmeta_equal		// Handle __eq metamethod.
+    } else {
+      |  // With FFI, local label 3 (type mismatch in the const-operand variants)
+      |  // diverts cdata operands to ->vmeta_equal_cd; anything else is "not equal".
+      |.if FFI
+      |3:
+      |  cmp RB, LJ_TCDATA
+      if (LJ_DUALNUM && vk) {
+	|  jne <9
+      } else {
+	|  jne <2
+      }
+      |  jmp ->vmeta_equal_cd
+      |.endif
+    }
+    break;
+  case BC_ISEQS: case BC_ISNES:
+    /* Compare slot RA against a string constant (vk = 1 for EQ, 0 for NE).
+    ** A non-string operand exits to local label 3, which is emitted by the
+    ** shared tail reached through the goto below. For strings only the
+    ** 32 bit object references are compared, not the string contents.
+    */
+    vk = op == BC_ISEQS;
+    |  ins_AND	// RA = src, RD = str const, JMP with RD = target
+    |  mov RB, [BASE+RA*8+4]
+    |  add PC, 4
+    |  cmp RB, LJ_TSTR; jne >3
+    |  mov RA, [BASE+RA*8]
+    |  cmp RA, [KBASE+RD*4]
+  iseqne_test:  /* Also entered by goto from BC_ISEQP/BC_ISNEP when !LJ_HASFFI. */
+    if (vk) {
+      |  jne >2
+    } else {
+      |  je >1
+    }
+    goto iseqne_end;
+  case BC_ISEQN: case BC_ISNEN:
+    /* Compare slot RA against a number constant at [KBASE+RD*8] (vk = 1 for
+    ** EQ, 0 for NE). A non-number operand exits to local label 3. The
+    ** floating-point paths end with ucomisd and continue in the shared tail
+    ** at iseqne_fp.
+    */
+    vk = op == BC_ISEQN;
+    |  ins_AD	// RA = src, RD = num const, JMP with RD = target
+    |  mov RB, [BASE+RA*8+4]
+    |  add PC, 4
+    |.if DUALNUM
+    |  cmp RB, LJ_TISNUM; jne >7
+    |  cmp dword [KBASE+RD*8+4], LJ_TISNUM; jne >8
+    |  // Slot and constant are both integers: compare the 32 bit payloads.
+    |  mov RB, dword [KBASE+RD*8]
+    |  cmp RB, dword [BASE+RA*8]
+    if (vk) {
+      |  jne >9
+    } else {
+      |  je >9
+    }
+    |  movzx RD, PC_RD
+    |  branchPC RD
+    |9:
+    |  ins_next
+    |
+    |7:  // RA is not an integer.
+    |  ja >3
+    |  // RA is a number.
+    |  cmp dword [KBASE+RD*8+4], LJ_TISNUM; jb >1
+    |  // RA is a number, RD is an integer.
+    |  cvtsi2sd xmm0, dword [KBASE+RD*8]
+    |  jmp >2
+    |
+    |8:  // RA is an integer, RD is a number.
+    |  cvtsi2sd xmm0, dword [BASE+RA*8]
+    |  ucomisd xmm0, qword [KBASE+RD*8]
+    |  jmp >4
+    |.else
+    |  cmp RB, LJ_TISNUM; jae >3
+    |.endif
+    |1:
+    |  movsd xmm0, qword [KBASE+RD*8]
+    |2:
+    |  ucomisd xmm0, qword [BASE+RA*8]
+    |4:
+    goto iseqne_fp;
+  case BC_ISEQP: case BC_ISNEP:
+    /* Compare the type tag of slot RA against RD (vk = 1 for EQ, 0 for NE).
+    ** Without FFI support the flags are handed to the shared tail at
+    ** iseqne_test. With FFI a dedicated tail is emitted here which diverts
+    ** cdata operands to ->vmeta_equal_cd.
+    */
+    vk = op == BC_ISEQP;
+    |  ins_AND	// RA = src, RD = primitive type (~), JMP with RD = target
+    |  mov RB, [BASE+RA*8+4]
+    |  add PC, 4
+    |  cmp RB, RD
+    if (!LJ_HASFFI) goto iseqne_test;
+    if (vk) {
+      |  jne >3
+      |  movzx RD, PC_RD
+      |  branchPC RD
+      |2:
+      |  ins_next
+      |3:  // Tags differ: not equal unless the operand is a cdata.
+      |  cmp RB, LJ_TCDATA; jne <2
+      |  jmp ->vmeta_equal_cd
+    } else {
+      |  je >2
+      |  // Tags differ: branch unless the operand is a cdata.
+      |  cmp RB, LJ_TCDATA; je ->vmeta_equal_cd
+      |  movzx RD, PC_RD
+      |  branchPC RD
+      |2:
+      |  ins_next
+    }
+    break;
+
+  /* -- Unary test and copy ops ------------------------------------------- */
+
+  case BC_ISTC: case BC_ISFC: case BC_IST: case BC_ISF:
+    |  // Conditional branch on the type tag of slot RD compared (unsigned)
+    |  // against LJ_TISTRUECOND. IST/ISTC skip the branch when the tag is
+    |  // >= LJ_TISTRUECOND, ISF/ISFC skip it when the tag is below. The copying
+    |  // variants ISTC/ISFC copy slot RD to slot RA only when the branch is taken.
+    |  ins_AD	// RA = dst or unused, RD = src, JMP with RD = target
+    |  mov RB, [BASE+RD*8+4]
+    |  add PC, 4
+    |  cmp RB, LJ_TISTRUECOND
+    if (op == BC_IST || op == BC_ISTC) {
+      |  jae >1
+    } else {
+      |  jb >1
+    }
+    if (op == BC_ISTC || op == BC_ISFC) {
+      |  // RB still holds the tag of the source slot; copy tag, then payload.
+      |  mov [BASE+RA*8+4], RB
+      |  mov RB, [BASE+RD*8]
+      |  mov [BASE+RA*8], RB
+    }
+    |  movzx RD, PC_RD
+    |  branchPC RD
+    |1:					// Fallthrough to the next instruction.
+    |  ins_next
+    break;
+
+  case BC_ISTYPE:
+    |  // Type assertion: RD holds the negated type, so adding the type tag of
+    |  // slot RA gives zero exactly when the types match. Any other result
+    |  // goes to ->vmeta_istype.
+    |  ins_AD	// RA = src, RD = -type
+    |  add RD, [BASE+RA*8+4]
+    |  jne ->vmeta_istype
+    |  ins_next
+    break;
+  case BC_ISNUM:
+    |  // Number assertion: slot RA must pass checknum, otherwise control goes
+    |  // to ->vmeta_istype. The RD operand is not read here.
+    |  ins_AD	// RA = src, RD = -(TISNUM-1)
+    |  checknum RA, ->vmeta_istype
+    |  ins_next
+    break;
+
+  /* -- Unary ops --------------------------------------------------------- */
+
+  case BC_MOV:
+    |  // Copy the 8 byte slot RD to slot RA: one 64 bit move on x64, otherwise
+    |  // two 32 bit moves (tag word and payload word).
+    |  ins_AD	// RA = dst, RD = src
+    |.if X64
+    |  mov RBa, [BASE+RD*8]
+    |  mov [BASE+RA*8], RBa
+    |.else
+    |  mov RB, [BASE+RD*8+4]
+    |  mov RD, [BASE+RD*8]
+    |  mov [BASE+RA*8+4], RB
+    |  mov [BASE+RA*8], RD
+    |.endif
+    |  ins_next_
+    break;
+  case BC_NOT:
+    |  // Logical not. The result tag is computed branch-free as
+    |  // LJ_TTRUE + carry, where the carry comes from the checktp compare
+    |  // (checktp is defined elsewhere; presumably it compares the tag of slot
+    |  // RD against LJ_TISTRUECOND -- confirm). Only the tag word of the
+    |  // destination slot is written.
+    |  ins_AD	// RA = dst, RD = src
+    |  xor RB, RB
+    |  checktp RD, LJ_TISTRUECOND
+    |  adc RB, LJ_TTRUE
+    |  mov [BASE+RA*8+4], RB
+    |  ins_next
+    break;
+  case BC_UNM:
+    |  // Unary minus. With DUALNUM an integer operand is negated in place and
+    |  // stored with the LJ_TISNUM tag; if the negation overflows, the double
+    |  // 2^31 is stored instead. A number operand has its sign flipped by
+    |  // xor-ing with the constant loaded by sseconst_sign. Anything else goes
+    |  // to ->vmeta_unm.
+    |  ins_AD	// RA = dst, RD = src
+    |.if DUALNUM
+    |  checkint RD, >5
+    |  mov RB, [BASE+RD*8]
+    |  neg RB
+    |  jo >4
+    |  mov dword [BASE+RA*8+4], LJ_TISNUM
+    |  mov dword [BASE+RA*8], RB
+    |9:
+    |  ins_next
+    |4:  // Integer overflow: store the result as a double.
+    |  mov dword [BASE+RA*8+4], 0x41e00000  // 2^31.
+    |  mov dword [BASE+RA*8], 0
+    |  jmp <9
+    |5:  // Not an integer: flags are still those of the checkint above.
+    |  ja ->vmeta_unm
+    |.else
+    |  checknum RD, ->vmeta_unm
+    |.endif
+    |  movsd xmm0, qword [BASE+RD*8]
+    |  sseconst_sign xmm1, RDa
+    |  xorps xmm0, xmm1
+    |  movsd qword [BASE+RA*8], xmm0
+    |.if DUALNUM
+    |  jmp <9
+    |.else
+    |  ins_next
+    |.endif
+    break;
+  case BC_LEN:
+    |  // Length operator. Strings: the length is read from the len field of
+    |  // the string object. Tables: lj_tab_len() is called (under LJ_52 only
+    |  // after checking for a possible __len metamethod). Everything else goes
+    |  // to ->vmeta_len. The result is stored as an integer with DUALNUM,
+    |  // otherwise as a double.
+    |  ins_AD	// RA = dst, RD = src
+    |  checkstr RD, >2
+    |  mov STR:RD, [BASE+RD*8]
+    |.if DUALNUM
+    |  mov RD, dword STR:RD->len
+    |1:
+    |  mov dword [BASE+RA*8+4], LJ_TISNUM
+    |  mov dword [BASE+RA*8], RD
+    |.else
+    |  xorps xmm0, xmm0
+    |  cvtsi2sd xmm0, dword STR:RD->len
+    |1:
+    |  movsd qword [BASE+RA*8], xmm0
+    |.endif
+    |  ins_next
+    |2:  // Not a string.
+    |  checktab RD, ->vmeta_len
+    |  mov TAB:FCARG1, [BASE+RD*8]
+#if LJ_52
+    |  mov TAB:RB, TAB:FCARG1->metatable
+    |  cmp TAB:RB, 0
+    |  jnz >9
+    |3:
+#endif
+    |->BC_LEN_Z:
+    |  mov RB, BASE			// Save BASE.
+    |  call extern lj_tab_len@4		// (GCtab *t)
+    |  // Length of table returned in eax (RD).
+    |.if DUALNUM
+    |  // Nothing to do.
+    |.else
+    |  cvtsi2sd xmm0, RD
+    |.endif
+    |  mov BASE, RB			// Restore BASE.
+    |  movzx RA, PC_RA			// Reload RA (ecx) after the C call.
+    |  jmp <1
+#if LJ_52
+    |9:  // Check for __len.
+    |  test byte TAB:RB->nomm, 1<<MM_len
+    |  jnz <3
+    |  jmp ->vmeta_len			// 'no __len' flag NOT set: check.
+#endif
+    break;
+
+  /* -- Binary ops -------------------------------------------------------- */
+
+    |// Operand fetch for the floating-point arithmetic ops. Decodes with ins_ABC
+    |// and sets the C variable vk from the opcode's distance to BC_ADDVN:
+    |// 0 selects the slot/constant form, 1 the constant/slot form, anything else
+    |// the slot/slot form. Type check failures go to ->vmeta_arith_vn, _nv or
+    |// _vv respectively. The first operand is loaded into xmm0, then
+    |// `sseins ssereg, <second operand>` is emitted. Invoked with `movsd, xmm1`
+    |// this merely loads the second operand into xmm1.
+    |.macro ins_arithpre, sseins, ssereg
+    |  ins_ABC
+    ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
+    ||switch (vk) {
+    ||case 0:
+    |   checknum RB, ->vmeta_arith_vn
+    |   .if DUALNUM
+    |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_vn
+    |   .endif
+    |   movsd xmm0, qword [BASE+RB*8]
+    |   sseins ssereg, qword [KBASE+RC*8]
+    ||  break;
+    ||case 1:
+    |   checknum RB, ->vmeta_arith_nv
+    |   .if DUALNUM
+    |     cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_nv
+    |   .endif
+    |   movsd xmm0, qword [KBASE+RC*8]
+    |   sseins ssereg, qword [BASE+RB*8]
+    ||  break;
+    ||default:
+    |   checknum RB, ->vmeta_arith_vv
+    |   checknum RC, ->vmeta_arith_vv
+    |   movsd xmm0, qword [BASE+RB*8]
+    |   sseins ssereg, qword [BASE+RC*8]
+    ||  break;
+    ||}
+    |.endmacro
+    |
+    |// Complete integer arithmetic instruction (used for DUALNUM builds). Both
+    |// operands must carry the integer tag, otherwise control goes to
+    |// ->vmeta_arith_vn/_nv/_vv. `intins` is applied to the 32 bit payloads and
+    |// a signed overflow goes to ->vmeta_arith_vno/_nvo/_vvo. The result lives
+    |// in RC for the constant/slot form (vk == 1) and in RB otherwise; it is
+    |// stored to slot RA with the LJ_TISNUM tag, followed by ins_next.
+    |.macro ins_arithdn, intins
+    |  ins_ABC
+    ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
+    ||switch (vk) {
+    ||case 0:
+    |   checkint RB, ->vmeta_arith_vn
+    |   cmp dword [KBASE+RC*8+4], LJ_TISNUM; jne ->vmeta_arith_vn
+    |   mov RB, [BASE+RB*8]
+    |   intins RB, [KBASE+RC*8]; jo ->vmeta_arith_vno
+    ||  break;
+    ||case 1:
+    |   checkint RB, ->vmeta_arith_nv
+    |   cmp dword [KBASE+RC*8+4], LJ_TISNUM; jne ->vmeta_arith_nv
+    |   mov RC, [KBASE+RC*8]
+    |   intins RC, [BASE+RB*8]; jo ->vmeta_arith_nvo
+    ||  break;
+    ||default:
+    |   checkint RB, ->vmeta_arith_vv
+    |   checkint RC, ->vmeta_arith_vv
+    |   mov RB, [BASE+RB*8]
+    |   intins RB, [BASE+RC*8]; jo ->vmeta_arith_vvo
+    ||  break;
+    ||}
+    |  mov dword [BASE+RA*8+4], LJ_TISNUM
+    ||if (vk == 1) {
+    |   mov dword [BASE+RA*8], RC
+    ||} else {
+    |   mov dword [BASE+RA*8], RB
+    ||}
+    |  ins_next
+    |.endmacro
+    |
+    |// Store the floating-point result in xmm0 to slot RA.
+    |.macro ins_arithpost
+    |  movsd qword [BASE+RA*8], xmm0
+    |.endmacro
+    |
+    |// One-argument form: floating-point only. Fetch operands, apply `sseins`
+    |// to xmm0, store the result and dispatch the next instruction.
+    |.macro ins_arith, sseins
+    |  ins_arithpre sseins, xmm0
+    |  ins_arithpost
+    |  ins_next
+    |.endmacro
+    |
+    |// Two-argument form: uses the integer instruction `intins` when DUALNUM is
+    |// enabled, otherwise falls back to the one-argument form with `sseins`.
+    |.macro ins_arith, intins, sseins
+    |.if DUALNUM
+    |  ins_arithdn intins
+    |.else
+    |  ins_arith, sseins
+    |.endif
+    |.endmacro
+
+    |  // Basic arithmetic. ADD, SUB and MUL use the two-argument ins_arith and
+    |  // so get an integer fast path under DUALNUM. DIV uses the one-argument
+    |  // form and is always computed in floating-point.
+    |  // RA = dst, RB = src1 or num const, RC = src2 or num const
+  case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
+    |  ins_arith add, addsd
+    break;
+  case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
+    |  ins_arith sub, subsd
+    break;
+  case BC_MULVN: case BC_MULNV: case BC_MULVV:
+    |  ins_arith imul, mulsd
+    break;
+  case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
+    |  ins_arith divsd
+    break;
+  case BC_MODVN:
+    |  // Modulo. The operands are loaded into xmm0 and xmm1 and ->vm_mod is
+    |  // called; the result is taken from xmm0 and stored to slot RA.
+    |  ins_arithpre movsd, xmm1
+    |->BC_MODVN_Z:
+    |  call ->vm_mod
+    |  ins_arithpost
+    |  ins_next
+    break;
+  case BC_MODNV: case BC_MODVV:
+    |  // Same as BC_MODVN after the operand fetch: share its tail.
+    |  ins_arithpre movsd, xmm1
+    |  jmp ->BC_MODVN_Z			// Avoid 3 copies. It's slow anyway.
+    break;
+  case BC_POW:
+    |  // Power operator via the external C function pow(). BASE is kept in RB
+    |  // across the call and RA is reloaded from PC_RA afterwards. On non-x64
+    |  // the two doubles are stored to FPARG1/FPARG3 before the call and the
+    |  // result is popped from the x87 stack straight into slot RA; on x64 the
+    |  // result is stored from xmm0.
+    |  ins_arithpre movsd, xmm1
+    |  mov RB, BASE
+    |.if not X64
+    |  movsd FPARG1, xmm0
+    |  movsd FPARG3, xmm1
+    |.endif
+    |  call extern pow
+    |  movzx RA, PC_RA
+    |  mov BASE, RB
+    |.if X64
+    |  ins_arithpost
+    |.else
+    |  fstp qword [BASE+RA*8]
+    |.endif
+    |  ins_next
+    break;
+
+  case BC_CAT:
+    |  // Concatenation of slots RB..RC. Calls lj_meta_cat(L, &slot[RC], RC-RB)
+    |  // after syncing L->base and SAVE_PC. A non-NULL return value goes to
+    |  // ->vmeta_binop; on NULL the result is copied from slot RB to slot RA
+    |  // (both re-read from the instruction via PC_RB/PC_RA).
+    |  ins_ABC	// RA = dst, RB = src_start, RC = src_end
+    |.if X64
+    |  mov L:CARG1d, SAVE_L
+    |  mov L:CARG1d->base, BASE
+    |  lea CARG2d, [BASE+RC*8]
+    |  mov CARG3d, RC
+    |  sub CARG3d, RB
+    |->BC_CAT_Z:
+    |  mov L:RB, L:CARG1d
+    |.else
+    |  lea RA, [BASE+RC*8]
+    |  sub RC, RB
+    |  mov ARG2, RA
+    |  mov ARG3, RC
+    |->BC_CAT_Z:
+    |  mov L:RB, SAVE_L
+    |  mov ARG1, L:RB
+    |  mov L:RB->base, BASE
+    |.endif
+    |  mov SAVE_PC, PC
+    |  call extern lj_meta_cat		// (lua_State *L, TValue *top, int left)
+    |  // NULL (finished) or TValue * (metamethod) returned in eax (RC).
+    |  mov BASE, L:RB->base
+    |  test RC, RC
+    |  jnz ->vmeta_binop
+    |  movzx RB, PC_RB			// Copy result to Stk[RA] from Stk[RB].
+    |  movzx RA, PC_RA
+    |.if X64
+    |  mov RCa, [BASE+RB*8]
+    |  mov [BASE+RA*8], RCa
+    |.else
+    |  mov RC, [BASE+RB*8+4]
+    |  mov RB, [BASE+RB*8]
+    |  mov [BASE+RA*8+4], RC
+    |  mov [BASE+RA*8], RB
+    |.endif
+    |  ins_next
+    break;
+
+  /* -- Constant ops ------------------------------------------------------ */
+
+  case BC_KSTR:
+    |  // Load a string constant: fetch the 32 bit reference from [KBASE+RD*4]
+    |  // and store it to slot RA with the LJ_TSTR tag.
+    |  ins_AND	// RA = dst, RD = str const (~)
+    |  mov RD, [KBASE+RD*4]
+    |  mov dword [BASE+RA*8+4], LJ_TSTR
+    |  mov [BASE+RA*8], RD
+    |  ins_next
+    break;
+  case BC_KCDATA:
+    |  // Load a cdata constant: same shape as BC_KSTR, but tagged LJ_TCDATA.
+    |  // Without FFI no code is emitted for this opcode.
+    |.if FFI
+    |  ins_AND	// RA = dst, RD = cdata const (~)
+    |  mov RD, [KBASE+RD*4]
+    |  mov dword [BASE+RA*8+4], LJ_TCDATA
+    |  mov [BASE+RA*8], RD
+    |  ins_next
+    |.endif
+    break;
+  case BC_KSHORT:
+    |  // Load a signed 16 bit literal. The low 16 bits of RD are sign-extended
+    |  // to 32 bits and stored either as a tagged integer (DUALNUM) or
+    |  // converted to a double.
+    |  ins_AD	// RA = dst, RD = signed int16 literal
+    |.if DUALNUM
+    |  movsx RD, RDW
+    |  mov dword [BASE+RA*8+4], LJ_TISNUM
+    |  mov dword [BASE+RA*8], RD
+    |.else
+    |  movsx RD, RDW			// Sign-extend literal.
+    |  cvtsi2sd xmm0, RD
+    |  movsd qword [BASE+RA*8], xmm0
+    |.endif
+    |  ins_next
+    break;
+  case BC_KNUM:
+    |  // Load a number constant: copy the 8 byte value at [KBASE+RD*8] to
+    |  // slot RA through xmm0.
+    |  ins_AD	// RA = dst, RD = num const
+    |  movsd xmm0, qword [KBASE+RD*8]
+    |  movsd qword [BASE+RA*8], xmm0
+    |  ins_next
+    break;
+  case BC_KPRI:
+    |  // Load a primitive value: only the tag word of slot RA is written, the
+    |  // payload word is left untouched.
+    |  ins_AND	// RA = dst, RD = primitive type (~)
+    |  mov [BASE+RA*8+4], RD
+    |  ins_next
+    break;
+  case BC_KNIL:
+    |  // Set slots RA..RD to nil by writing LJ_TNIL to their tag words only.
+    |  // RA is turned into the address of the tag word of the SECOND slot and
+    |  // RD into the address of the tag word of the last slot. The first slot
+    |  // is written through [RA-8], then the loop writes at least one more.
+    |  ins_AD	// RA = dst_start, RD = dst_end
+    |  lea RA, [BASE+RA*8+12]
+    |  lea RD, [BASE+RD*8+4]
+    |  mov RB, LJ_TNIL
+    |  mov [RA-8], RB			// Sets minimum 2 slots.
+    |1:
+    |  mov [RA], RB
+    |  add RA, 8
+    |  cmp RA, RD
+    |  jbe <1
+    |  ins_next
+    break;
+
+  /* -- Upvalue and function ops ------------------------------------------ */
+
+  case BC_UGET:
+    |  // Read an upvalue. The function object is taken from [BASE-8], its
+    |  // uvptr[RD] entry gives the upvalue object, and the 8 byte value that
+    |  // its v field points to is copied to slot RA.
+    |  ins_AD	// RA = dst, RD = upvalue #
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov UPVAL:RB, [LFUNC:RB+RD*4+offsetof(GCfuncL, uvptr)]
+    |  mov RB, UPVAL:RB->v
+    |.if X64
+    |  mov RDa, [RB]
+    |  mov [BASE+RA*8], RDa
+    |.else
+    |  mov RD, [RB+4]
+    |  mov RB, [RB]
+    |  mov [BASE+RA*8+4], RD
+    |  mov [BASE+RA*8], RB
+    |.endif
+    |  ins_next
+    break;
+  case BC_USETV:
+/* Byte offset from the tv field of a GCupval to its marked field. */
+#define TV2MARKOFS \
+ ((int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv))
+    |  // Store slot RD into upvalue RA. The value is always written first.
+    |  // A write barrier is only considered when the upvalue is closed, is
+    |  // marked black and the stored value is a white GC object.
+    |  ins_AD	// RA = upvalue #, RD = src
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
+    |  cmp byte UPVAL:RB->closed, 0
+    |  // The following movs do not modify the flags, so the jz below still
+    |  // tests the result of the 'closed' compare above.
+    |  mov RB, UPVAL:RB->v
+    |  mov RA, [BASE+RD*8]
+    |  mov RD, [BASE+RD*8+4]
+    |  mov [RB], RA
+    |  mov [RB+4], RD
+    |  jz >1
+    |  // Check barrier for closed upvalue.
+    |  test byte [RB+TV2MARKOFS], LJ_GC_BLACK		// isblack(uv)
+    |  jnz >2
+    |1:
+    |  ins_next
+    |
+    |2:  // Upvalue is black. Check if new value is collectable and white.
+    |  sub RD, LJ_TISGCV
+    |  cmp RD, LJ_TNUMX - LJ_TISGCV			// tvisgcv(v)
+    |  jbe <1
+    |  test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES	// iswhite(v)
+    |  jz <1
+    |  // Crossed a write barrier. Move the barrier forward.
+    |.if X64 and not X64WIN
+    |  mov FCARG2, RB
+    |  mov RB, BASE			// Save BASE.
+    |.else
+    |  xchg FCARG2, RB			// Save BASE (FCARG2 == BASE).
+    |.endif
+    |  lea GL:FCARG1, [DISPATCH+GG_DISP2G]
+    |  call extern lj_gc_barrieruv@8	// (global_State *g, TValue *tv)
+    |  mov BASE, RB			// Restore BASE.
+    |  jmp <1
+    break;
+#undef TV2MARKOFS
+  case BC_USETS:
+    |  // Store a string constant into upvalue RA. The value is written first;
+    |  // the barrier call is made only if the upvalue is black, the string is
+    |  // white and the upvalue is closed.
+    |  ins_AND	// RA = upvalue #, RD = str const (~)
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
+    |  mov GCOBJ:RA, [KBASE+RD*4]
+    |  mov RD, UPVAL:RB->v
+    |  mov [RD], GCOBJ:RA
+    |  mov dword [RD+4], LJ_TSTR
+    |  test byte UPVAL:RB->marked, LJ_GC_BLACK		// isblack(uv)
+    |  jnz >2
+    |1:
+    |  ins_next
+    |
+    |2:  // Check if string is white and ensure upvalue is closed.
+    |  test byte GCOBJ:RA->gch.marked, LJ_GC_WHITES	// iswhite(str)
+    |  jz <1
+    |  cmp byte UPVAL:RB->closed, 0
+    |  jz <1
+    |  // Crossed a write barrier. Move the barrier forward.
+    |  mov RB, BASE			// Save BASE (FCARG2 == BASE).
+    |  mov FCARG2, RD
+    |  lea GL:FCARG1, [DISPATCH+GG_DISP2G]
+    |  call extern lj_gc_barrieruv@8	// (global_State *g, TValue *tv)
+    |  mov BASE, RB			// Restore BASE.
+    |  jmp <1
+    break;
+  case BC_USETN:
+    |  // Store a number constant into upvalue RA: the 8 byte constant is copied
+    |  // through xmm0 to the location the upvalue's v field points to. No
+    |  // barrier code is emitted for this opcode.
+    |  ins_AD	// RA = upvalue #, RD = num const
+    |  mov LFUNC:RB, [BASE-8]
+    |  movsd xmm0, qword [KBASE+RD*8]
+    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
+    |  mov RA, UPVAL:RB->v
+    |  movsd qword [RA], xmm0
+    |  ins_next
+    break;
+  case BC_USETP:
+    |  // Store a primitive value into upvalue RA: only the tag word of the
+    |  // upvalue's value is written. No barrier code is emitted.
+    |  ins_AND	// RA = upvalue #, RD = primitive type (~)
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
+    |  mov RA, UPVAL:RB->v
+    |  mov [RA+4], RD
+    |  ins_next
+    break;
+  case BC_UCLO:
+    |  // Close upvalues and jump. The branch is applied to PC first. If the
+    |  // openupval field of L is non-zero, lj_func_closeuv(L, &slot[RA]) is
+    |  // called with L->base synced before and BASE reloaded after the call.
+    |  ins_AD	// RA = level, RD = target
+    |  branchPC RD			// Do this first to free RD.
+    |  mov L:RB, SAVE_L
+    |  cmp dword L:RB->openupval, 0
+    |  je >1
+    |  mov L:RB->base, BASE
+    |  lea FCARG2, [BASE+RA*8]		// Caveat: FCARG2 == BASE
+    |  mov L:FCARG1, L:RB		// Caveat: FCARG1 == RA
+    |  call extern lj_func_closeuv@8	// (lua_State *L, TValue *level)
+    |  mov BASE, L:RB->base
+    |1:
+    |  ins_next
+    break;
+
+  case BC_FNEW:
+    |  // Create a closure: calls lj_func_newL_gc(L, pt, parent) with the
+    |  // prototype constant and the function object read from [BASE-8] as
+    |  // parent. L->base and SAVE_PC are synced before the call. The result is
+    |  // stored to slot RA (re-read via PC_RA) with the LJ_TFUNC tag.
+    |  ins_AND	// RA = dst, RD = proto const (~) (holding function prototype)
+    |.if X64
+    |  mov L:RB, SAVE_L
+    |  mov L:RB->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
+    |  mov CARG3d, [BASE-8]
+    |  mov CARG2d, [KBASE+RD*4]		// Fetch GCproto *.
+    |  mov CARG1d, L:RB
+    |.else
+    |  mov LFUNC:RA, [BASE-8]
+    |  mov PROTO:RD, [KBASE+RD*4]	// Fetch GCproto *.
+    |  mov L:RB, SAVE_L
+    |  mov ARG3, LFUNC:RA
+    |  mov ARG2, PROTO:RD
+    |  mov ARG1, L:RB
+    |  mov L:RB->base, BASE
+    |.endif
+    |  mov SAVE_PC, PC
+    |  // (lua_State *L, GCproto *pt, GCfuncL *parent)
+    |  call extern lj_func_newL_gc
+    |  // GCfuncL * returned in eax (RC).
+    |  mov BASE, L:RB->base
+    |  movzx RA, PC_RA
+    |  mov [BASE+RA*8], LFUNC:RC
+    |  mov dword [BASE+RA*8+4], LJ_TFUNC
+    |  ins_next
+    break;
+
+  /* -- Table ops --------------------------------------------------------- */
+
+  case BC_TNEW:
+    |  // Create a new table. If gc.total >= gc.threshold a GC step is run
+    |  // first (label 5). The operand is split into asize (low 11 bits) and
+    |  // hbits (operand >> 11), then lj_tab_new(L, asize, hbits) is called and
+    |  // the result is stored to slot RA with the LJ_TTAB tag.
+    |  ins_AD	// RA = dst, RD = hbits|asize
+    |  mov L:RB, SAVE_L
+    |  mov L:RB->base, BASE
+    |  mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
+    |  cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
+    |  mov SAVE_PC, PC			// mov keeps the flags for the jae below.
+    |  jae >5
+    |1:
+    |.if X64
+    |  mov CARG3d, RD
+    |  and RD, 0x7ff
+    |  shr CARG3d, 11
+    |.else
+    |  mov RA, RD
+    |  and RD, 0x7ff
+    |  shr RA, 11
+    |  mov ARG3, RA
+    |.endif
+    |  cmp RD, 0x7ff
+    |  je >3
+    |2:
+    |.if X64
+    |  mov L:CARG1d, L:RB
+    |  mov CARG2d, RD
+    |.else
+    |  mov ARG1, L:RB
+    |  mov ARG2, RD
+    |.endif
+    |  call extern lj_tab_new  // (lua_State *L, int32_t asize, uint32_t hbits)
+    |  // Table * returned in eax (RC).
+    |  mov BASE, L:RB->base
+    |  movzx RA, PC_RA
+    |  mov [BASE+RA*8], TAB:RC
+    |  mov dword [BASE+RA*8+4], LJ_TTAB
+    |  ins_next
+    |3:  // Turn 0x7ff into 0x801.
+    |  mov RD, 0x801
+    |  jmp <2
+    |5:  // GC threshold reached: run a GC step, then reload the operand.
+    |  mov L:FCARG1, L:RB
+    |  call extern lj_gc_step_fixtop@4	// (lua_State *L)
+    |  movzx RD, PC_RD
+    |  jmp <1
+    break;
+  case BC_TDUP:
+    |  // Duplicate a template table. If gc.total >= gc.threshold a GC step is
+    |  // run first (label 3). Then lj_tab_dup(L, kt) is called with the table
+    |  // constant and the result is stored to slot RA with the LJ_TTAB tag.
+    |  ins_AND	// RA = dst, RD = table const (~) (holding template table)
+    |  mov L:RB, SAVE_L
+    |  mov RA, [DISPATCH+DISPATCH_GL(gc.total)]
+    |  mov SAVE_PC, PC
+    |  cmp RA, [DISPATCH+DISPATCH_GL(gc.threshold)]
+    |  mov L:RB->base, BASE		// mov keeps the flags for the jae below.
+    |  jae >3
+    |2:
+    |  mov TAB:FCARG2, [KBASE+RD*4]	// Caveat: FCARG2 == BASE
+    |  mov L:FCARG1, L:RB		// Caveat: FCARG1 == RA
+    |  call extern lj_tab_dup@8		// (lua_State *L, Table *kt)
+    |  // Table * returned in eax (RC).
+    |  mov BASE, L:RB->base
+    |  movzx RA, PC_RA
+    |  mov [BASE+RA*8], TAB:RC
+    |  mov dword [BASE+RA*8+4], LJ_TTAB
+    |  ins_next
+    |3:
+    |  mov L:FCARG1, L:RB
+    |  call extern lj_gc_step_fixtop@4	// (lua_State *L)
+    |  movzx RD, PC_RD			// Need to reload RD.
+    |  // Presumably redoes the complement that ins_AND applied to RD (the
+    |  // "(~)" in the operand comment); ins_AND is defined elsewhere -- confirm.
+    |  not RDa
+    |  jmp <2
+    break;
+
+  case BC_GGET:
+    |  // Global get: load the env table of the function object at [BASE-8]
+    |  // and the string constant, then continue in the string-key table get
+    |  // at ->BC_TGETS_Z.
+    |  ins_AND	// RA = dst, RD = str const (~)
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov TAB:RB, LFUNC:RB->env
+    |  mov STR:RC, [KBASE+RD*4]
+    |  jmp ->BC_TGETS_Z
+    break;
+  case BC_GSET:
+    |  // Global set: load the env table of the function object at [BASE-8]
+    |  // and the string constant, then continue in the string-key table set
+    |  // at ->BC_TSETS_Z.
+    |  ins_AND	// RA = src, RD = str const (~)
+    |  mov LFUNC:RB, [BASE-8]
+    |  mov TAB:RB, LFUNC:RB->env
+    |  mov STR:RC, [KBASE+RD*4]
+    |  jmp ->BC_TSETS_Z
+    break;
+
+  case BC_TGETV:
+    |  // Table get with a variable key. Fast path: the operand is a table and
+    |  // the key is an integer (or, without DUALNUM, a number that survives a
+    |  // round trip through a 32 bit integer) below asize; the array slot is
+    |  // copied to slot RA. A nil slot consults the metatable for __index.
+    |  // String keys continue at ->BC_TGETS_Z; everything else goes to
+    |  // ->vmeta_tgetv.
+    |  ins_ABC	// RA = dst, RB = table, RC = key
+    |  checktab RB, ->vmeta_tgetv
+    |  mov TAB:RB, [BASE+RB*8]
+    |
+    |  // Integer key?
+    |.if DUALNUM
+    |  checkint RC, >5
+    |  mov RC, dword [BASE+RC*8]
+    |.else
+    |  // Convert number to int and back and compare.
+    |  checknum RC, >5
+    |  movsd xmm0, qword [BASE+RC*8]
+    |  cvttsd2si RC, xmm0
+    |  cvtsi2sd xmm1, RC
+    |  ucomisd xmm0, xmm1
+    |  jne ->vmeta_tgetv		// Generic numeric key? Use fallback.
+    |.endif
+    |  cmp RC, TAB:RB->asize	// Takes care of unordered, too.
+    |  jae ->vmeta_tgetv		// Not in array part? Use fallback.
+    |  shl RC, 3			// RC = address of array slot [key].
+    |  add RC, TAB:RB->array
+    |  cmp dword [RC+4], LJ_TNIL	// Avoid overwriting RB in fastpath.
+    |  je >2
+    |  // Get array slot.
+    |.if X64
+    |  mov RBa, [RC]
+    |  mov [BASE+RA*8], RBa
+    |.else
+    |  mov RB, [RC]
+    |  mov RC, [RC+4]
+    |  mov [BASE+RA*8], RB
+    |  mov [BASE+RA*8+4], RC
+    |.endif
+    |1:
+    |  ins_next
+    |
+    |2:  // Check for __index if table value is nil.
+    |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
+    |  jz >3
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test byte TAB:RA->nomm, 1<<MM_index
+    |  jz ->vmeta_tgetv			// 'no __index' flag NOT set: check.
+    |  movzx RA, PC_RA			// Restore RA.
+    |3:  // No __index possible: the result is nil (tag word only).
+    |  mov dword [BASE+RA*8+4], LJ_TNIL
+    |  jmp <1
+    |
+    |5:  // String key?
+    |  checkstr RC, ->vmeta_tgetv
+    |  mov STR:RC, [BASE+RC*8]
+    |  jmp ->BC_TGETS_Z
+    break;
+  case BC_TGETS:
+    |  // Table get with a string constant key. The start node is selected by
+    |  // hmask & str->sid and the chain is followed through the next field
+    |  // until a node with an LJ_TSTR key holding the same string reference is
+    |  // found. A nil value or a missing key consults the metatable for
+    |  // __index. RA is used as the node pointer here, so the destination slot
+    |  // index is re-read from PC_RA before storing.
+    |  ins_ABC	// RA = dst, RB = table, RC = str const (~)
+    |  not RCa
+    |  mov STR:RC, [KBASE+RC*4]
+    |  checktab RB, ->vmeta_tgets
+    |  mov TAB:RB, [BASE+RB*8]
+    |->BC_TGETS_Z:	// RB = GCtab *, RC = GCstr *, refetches PC_RA.
+    |  mov RA, TAB:RB->hmask
+    |  and RA, STR:RC->sid
+    |  imul RA, #NODE
+    |  add NODE:RA, TAB:RB->node
+    |1:
+    |  cmp dword NODE:RA->key.it, LJ_TSTR
+    |  jne >4
+    |  cmp dword NODE:RA->key.gcr, STR:RC
+    |  jne >4
+    |  // Ok, key found. Assumes: offsetof(Node, val) == 0
+    |  cmp dword [RA+4], LJ_TNIL	// Avoid overwriting RB in fastpath.
+    |  je >5				// Key found, but nil value?
+    |  movzx RC, PC_RA
+    |  // Get node value.
+    |.if X64
+    |  mov RBa, [RA]
+    |  mov [BASE+RC*8], RBa
+    |.else
+    |  mov RB, [RA]
+    |  mov RA, [RA+4]
+    |  mov [BASE+RC*8], RB
+    |  mov [BASE+RC*8+4], RA
+    |.endif
+    |2:
+    |  ins_next
+    |
+    |3:  // Store nil as the result (tag word only).
+    |  movzx RC, PC_RA
+    |  mov dword [BASE+RC*8+4], LJ_TNIL
+    |  jmp <2
+    |
+    |4:  // Follow hash chain.
+    |  mov NODE:RA, NODE:RA->next
+    |  test NODE:RA, NODE:RA
+    |  jnz <1
+    |  // End of hash chain: key not found, nil result.
+    |
+    |5:  // Check for __index if table value is nil.
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test TAB:RA, TAB:RA
+    |  jz <3				// No metatable: done.
+    |  test byte TAB:RA->nomm, 1<<MM_index
+    |  jnz <3				// 'no __index' flag set: done.
+    |  jmp ->vmeta_tgets		// Caveat: preserve STR:RC.
+    break;
+  case BC_TGETB:
+    |  // Table get with a byte literal index. Fast path: the operand is a
+    |  // table and the literal is below asize; the array slot is copied to
+    |  // slot RA. A nil slot consults the metatable for __index. Everything
+    |  // else goes to ->vmeta_tgetb.
+    |  ins_ABC	// RA = dst, RB = table, RC = byte literal
+    |  checktab RB, ->vmeta_tgetb
+    |  mov TAB:RB, [BASE+RB*8]
+    |  cmp RC, TAB:RB->asize
+    |  jae ->vmeta_tgetb
+    |  shl RC, 3			// RC = address of array slot [literal].
+    |  add RC, TAB:RB->array
+    |  cmp dword [RC+4], LJ_TNIL	// Avoid overwriting RB in fastpath.
+    |  je >2
+    |  // Get array slot.
+    |.if X64
+    |  mov RBa, [RC]
+    |  mov [BASE+RA*8], RBa
+    |.else
+    |  mov RB, [RC]
+    |  mov RC, [RC+4]
+    |  mov [BASE+RA*8], RB
+    |  mov [BASE+RA*8+4], RC
+    |.endif
+    |1:
+    |  ins_next
+    |
+    |2:  // Check for __index if table value is nil.
+    |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
+    |  jz >3
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test byte TAB:RA->nomm, 1<<MM_index
+    |  jz ->vmeta_tgetb			// 'no __index' flag NOT set: check.
+    |  movzx RA, PC_RA			// Restore RA.
+    |3:  // No __index possible: the result is nil (tag word only).
+    |  mov dword [BASE+RA*8+4], LJ_TNIL
+    |  jmp <1
+    break;
+  case BC_TGETR:
+    |  // Raw array get. Unlike BC_TGETV this emits no type check for the table
+    |  // or the key and no metatable check; the operand types are presumably
+    |  // guaranteed by whoever emits this opcode -- confirm. An index outside
+    |  // the array part goes to ->vmeta_tgetr.
+    |  ins_ABC	// RA = dst, RB = table, RC = key
+    |  mov TAB:RB, [BASE+RB*8]
+    |.if DUALNUM
+    |  mov RC, dword [BASE+RC*8]
+    |.else
+    |  cvttsd2si RC, qword [BASE+RC*8]
+    |.endif
+    |  cmp RC, TAB:RB->asize
+    |  jae ->vmeta_tgetr		// Not in array part? Use fallback.
+    |  shl RC, 3
+    |  add RC, TAB:RB->array
+    |  // Get array slot.
+    |->BC_TGETR_Z:
+    |.if X64
+    |  mov RBa, [RC]
+    |  mov [BASE+RA*8], RBa
+    |.else
+    |  mov RB, [RC]
+    |  mov RC, [RC+4]
+    |  mov [BASE+RA*8], RB
+    |  mov [BASE+RA*8+4], RC
+    |.endif
+    |->BC_TGETR2_Z:
+    |  ins_next
+    break;
+
+  case BC_TSETV:
+    |  // Table set with a variable key. Fast path: the operand is a table and
+    |  // the key is an integer (or, without DUALNUM, a number that survives a
+    |  // round trip through a 32 bit integer) below asize. If the previous
+    |  // value is nil the metatable is consulted for __newindex. A black table
+    |  // gets a barrierback before slot RA is copied into the array slot.
+    |  // String keys continue at ->BC_TSETS_Z; everything else goes to
+    |  // ->vmeta_tsetv.
+    |  ins_ABC	// RA = src, RB = table, RC = key
+    |  checktab RB, ->vmeta_tsetv
+    |  mov TAB:RB, [BASE+RB*8]
+    |
+    |  // Integer key?
+    |.if DUALNUM
+    |  checkint RC, >5
+    |  mov RC, dword [BASE+RC*8]
+    |.else
+    |  // Convert number to int and back and compare.
+    |  checknum RC, >5
+    |  movsd xmm0, qword [BASE+RC*8]
+    |  cvttsd2si RC, xmm0
+    |  cvtsi2sd xmm1, RC
+    |  ucomisd xmm0, xmm1
+    |  jne ->vmeta_tsetv		// Generic numeric key? Use fallback.
+    |.endif
+    |  cmp RC, TAB:RB->asize		// Takes care of unordered, too.
+    |  jae ->vmeta_tsetv
+    |  shl RC, 3			// RC = address of array slot [key].
+    |  add RC, TAB:RB->array
+    |  cmp dword [RC+4], LJ_TNIL
+    |  je >3				// Previous value is nil?
+    |1:
+    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+    |  jnz >7
+    |2:  // Set array slot.
+    |.if X64
+    |  mov RBa, [BASE+RA*8]
+    |  mov [RC], RBa
+    |.else
+    |  mov RB, [BASE+RA*8+4]
+    |  mov RA, [BASE+RA*8]
+    |  mov [RC+4], RB
+    |  mov [RC], RA
+    |.endif
+    |  ins_next
+    |
+    |3:  // Check for __newindex if previous value is nil.
+    |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
+    |  jz <1
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test byte TAB:RA->nomm, 1<<MM_newindex
+    |  jz ->vmeta_tsetv			// 'no __newindex' flag NOT set: check.
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <1
+    |
+    |5:  // String key?
+    |  checkstr RC, ->vmeta_tsetv
+    |  mov STR:RC, [BASE+RC*8]
+    |  jmp ->BC_TSETS_Z
+    |
+    |7:  // Possible table write barrier for the value. Skip valiswhite check.
+    |  barrierback TAB:RB, RA
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <2
+    break;
+  case BC_TSETS:
+    |  // Table set with a string constant key. The hash chain selected by
+    |  // hmask & str->sid is searched for the key. If the key exists, the
+    |  // value from slot RA (re-read via PC_RA, since RA is used as the node
+    |  // pointer) is stored into the node, after a __newindex check when the
+    |  // previous value is nil and a barrierback when the table is black. If
+    |  // the key is missing and no __newindex can apply, lj_tab_newkey() is
+    |  // called with a temporary string TValue built in TMP1/TMP2 and the
+    |  // store continues with the returned slot.
+    |  ins_ABC	// RA = src, RB = table, RC = str const (~)
+    |  not RCa
+    |  mov STR:RC, [KBASE+RC*4]
+    |  checktab RB, ->vmeta_tsets
+    |  mov TAB:RB, [BASE+RB*8]
+    |->BC_TSETS_Z:	// RB = GCtab *, RC = GCstr *, refetches PC_RA.
+    |  mov RA, TAB:RB->hmask
+    |  and RA, STR:RC->sid
+    |  imul RA, #NODE
+    |  mov byte TAB:RB->nomm, 0		// Clear metamethod cache.
+    |  add NODE:RA, TAB:RB->node
+    |1:
+    |  cmp dword NODE:RA->key.it, LJ_TSTR
+    |  jne >5
+    |  cmp dword NODE:RA->key.gcr, STR:RC
+    |  jne >5
+    |  // Ok, key found. Assumes: offsetof(Node, val) == 0
+    |  cmp dword [RA+4], LJ_TNIL
+    |  je >4				// Previous value is nil?
+    |2:
+    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+    |  jnz >7
+    |3:  // Set node value.
+    |  movzx RC, PC_RA
+    |.if X64
+    |  mov RBa, [BASE+RC*8]
+    |  mov [RA], RBa
+    |.else
+    |  mov RB, [BASE+RC*8+4]
+    |  mov RC, [BASE+RC*8]
+    |  mov [RA+4], RB
+    |  mov [RA], RC
+    |.endif
+    |  ins_next
+    |
+    |4:  // Check for __newindex if previous value is nil.
+    |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
+    |  jz <2
+    |  // RA holds the node pointer here and cannot be re-read from the
+    |  // instruction, so it is parked in TMP1 while RA is used as scratch.
+    |  mov TMP1, RA			// Save RA.
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test byte TAB:RA->nomm, 1<<MM_newindex
+    |  jz ->vmeta_tsets			// 'no __newindex' flag NOT set: check.
+    |  mov RA, TMP1			// Restore RA.
+    |  jmp <2
+    |
+    |5:  // Follow hash chain.
+    |  mov NODE:RA, NODE:RA->next
+    |  test NODE:RA, NODE:RA
+    |  jnz <1
+    |  // End of hash chain: key not found, add a new one.
+    |
+    |  // But check for __newindex first.
+    |  mov TAB:RA, TAB:RB->metatable
+    |  test TAB:RA, TAB:RA
+    |  jz >6				// No metatable: continue.
+    |  test byte TAB:RA->nomm, 1<<MM_newindex
+    |  jz ->vmeta_tsets			// 'no __newindex' flag NOT set: check.
+    |6:
+    |  mov TMP1, STR:RC
+    |  mov TMP2, LJ_TSTR
+    |  mov TMP3, TAB:RB			// Save TAB:RB for us.
+    |.if X64
+    |  mov L:CARG1d, SAVE_L
+    |  mov L:CARG1d->base, BASE
+    |  lea CARG3, TMP1
+    |  mov CARG2d, TAB:RB
+    |  mov L:RB, L:CARG1d
+    |.else
+    |  lea RC, TMP1			// Store temp. TValue in TMP1/TMP2.
+    |  mov ARG2, TAB:RB
+    |  mov L:RB, SAVE_L
+    |  mov ARG3, RC
+    |  mov ARG1, L:RB
+    |  mov L:RB->base, BASE
+    |.endif
+    |  mov SAVE_PC, PC
+    |  call extern lj_tab_newkey	// (lua_State *L, GCtab *t, TValue *k)
+    |  // Handles write barrier for the new key. TValue * returned in eax (RC).
+    |  mov BASE, L:RB->base
+    |  mov TAB:RB, TMP3			// Need TAB:RB for barrier.
+    |  mov RA, eax			// RA = pointer to the new value slot.
+    |  jmp <2				// Must check write barrier for value.
+    |
+    |7:  // Possible table write barrier for the value. Skip valiswhite check.
+    |  barrierback TAB:RB, RC		// Destroys STR:RC.
+    |  jmp <3
+    break;
+  case BC_TSETB:  /* Table store keyed by the byte literal in RC; only the array part is handled inline. */
+    |  ins_ABC	// RA = src, RB = table, RC = byte literal
+    |  checktab RB, ->vmeta_tsetb	// Macro defined elsewhere; presumably exits to vmeta_tsetb unless slot RB holds a table.
+    |  mov TAB:RB, [BASE+RB*8]
+    |  cmp RC, TAB:RB->asize
+    |  jae ->vmeta_tsetb		// Unsigned compare: index not below asize leaves the inline path.
+    |  shl RC, 3			// Scale the index by the 8-byte slot size.
+    |  add RC, TAB:RB->array		// RC = address of the target array slot.
+    |  cmp dword [RC+4], LJ_TNIL	// The type word is the dword at offset 4 of the slot.
+    |  je >3				// Previous value is nil?
+    |1:
+    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+    |  jnz >7
+    |2:	 // Set array slot.
+    |.if X64
+    |  mov RAa, [BASE+RA*8]		// x64: copy the whole 8-byte slot with one load/store pair.
+    |  mov [RC], RAa
+    |.else
+    |  mov RB, [BASE+RA*8+4]		// x86: RB is reused as scratch; the table pointer is not read again before ins_next.
+    |  mov RA, [BASE+RA*8]		// Must come after the load above, since it overwrites the slot index in RA.
+    |  mov [RC+4], RB
+    |  mov [RC], RA
+    |.endif
+    |  ins_next
+    |
+    |3:  // Check for __newindex if previous value is nil.
+    |  cmp dword TAB:RB->metatable, 0	// Shouldn't overwrite RA for fastpath.
+    |  jz <1				// No metatable: continue with the plain store.
+    |  mov TAB:RA, TAB:RB->metatable	// RA is clobbered here and refetched from the instruction below.
+    |  test byte TAB:RA->nomm, 1<<MM_newindex
+    |  jz ->vmeta_tsetb			// 'no __newindex' flag NOT set: check.
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <1
+    |
+    |7:  // Possible table write barrier for the value. Skip valiswhite check.
+    |  barrierback TAB:RB, RA		// Macro defined elsewhere; RA is passed as its second operand and reloaded right after.
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <2
+    break;
+  case BC_TSETR:  /* Array-slot store; this arm performs no table type check and no metatable lookup. */
+    |  ins_ABC	// RA = src, RB = table, RC = key
+    |  mov TAB:RB, [BASE+RB*8]		// Loaded without a preceding checktab.
+    |.if DUALNUM
+    |  mov RC, dword [BASE+RC*8]	// DUALNUM: key is taken from the low dword of the slot.
+    |.else
+    |  cvttsd2si RC, qword [BASE+RC*8]	// Otherwise the slot is read as a double and truncated to an integer.
+    |.endif
+    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+    |  jnz >7
+    |2:
+    |  cmp RC, TAB:RB->asize
+    |  jae ->vmeta_tsetr		// Unsigned compare: index not below asize leaves the inline path.
+    |  shl RC, 3			// Scale the index by the 8-byte slot size.
+    |  add RC, TAB:RB->array		// RC = address of the target array slot.
+    |  // Set array slot.
+    |->BC_TSETR_Z:			// Global label; code entering here needs RC = slot address and RA = source slot index.
+    |.if X64
+    |  mov RBa, [BASE+RA*8]		// x64: copy the whole 8-byte slot with one load/store pair.
+    |  mov [RC], RBa
+    |.else
+    |  mov RB, [BASE+RA*8+4]		// x86: copy the two dwords separately, RB and RA serve as scratch.
+    |  mov RA, [BASE+RA*8]		// Must come after the load above, since it overwrites the slot index in RA.
+    |  mov [RC+4], RB
+    |  mov [RC], RA
+    |.endif
+    |  ins_next
+    |
+    |7:  // Possible table write barrier for the value. Skip valiswhite check.
+    |  barrierback TAB:RB, RA		// Macro defined elsewhere; RA is passed as its second operand and reloaded right after.
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <2
+    break;
+
+  case BC_TSETM:
+    |  ins_AD	// RA = base (table at base-1), RD = num const (start index)
+    |  mov TMP1, KBASE			// Need one more free register.
+    |  mov KBASE, dword [KBASE+RD*8]	// Integer constant is in lo-word.
+    |1:
+    |  lea RA, [BASE+RA*8]
+    |  mov TAB:RB, [RA-8]		// Guaranteed to be a table.
+    |  test byte TAB:RB->marked, LJ_GC_BLACK	// isblack(table)
+    |  jnz >7
+    |2:
+    |  mov RD, MULTRES
+    |  sub RD, 1
+    |  jz >4				// Nothing to copy?
+    |  add RD, KBASE			// Compute needed size.
+    |  cmp RD, TAB:RB->asize
+    |  ja >5				// Doesn't fit into array part?
+    |  sub RD, KBASE
+    |  shl KBASE, 3
+    |  add KBASE, TAB:RB->array
+    |3:  // Copy result slots to table.
+    |.if X64
+    |  mov RBa, [RA]
+    |  add RA, 8
+    |  mov [KBASE], RBa
+    |.else
+    |  mov RB, [RA]
+    |  mov [KBASE], RB
+    |  mov RB, [RA+4]
+    |  add RA, 8
+    |  mov [KBASE+4], RB
+    |.endif
+    |  add KBASE, 8
+    |  sub RD, 1
+    |  jnz <3
+    |4:
+    |  mov KBASE, TMP1
+    |  ins_next
+    |
+    |5:  // Need to resize array part.
+    |.if X64
+    |  mov L:CARG1d, SAVE_L
+    |  mov L:CARG1d->base, BASE		// Caveat: CARG2d/CARG3d may be BASE.
+    |  mov CARG2d, TAB:RB
+    |  mov CARG3d, RD
+    |  mov L:RB, L:CARG1d
+    |.else
+    |  mov ARG2, TAB:RB
+    |  mov L:RB, SAVE_L
+    |  mov L:RB->base, BASE
+    |  mov ARG3, RD
+    |  mov ARG1, L:RB
+    |.endif
+    |  mov SAVE_PC, PC
+    |  call extern lj_tab_reasize	// (lua_State *L, GCtab *t, int nasize)
+    |  mov BASE, L:RB->base
+    |  movzx RA, PC_RA			// Restore RA.
+    |  jmp <1				// Retry.
+    |
+    |7:  // Possible table write barrier for any value. Skip valiswhite check.
+    |  barrierback TAB:RB, RD
+    |  jmp <2
+    break;
+
+  /* -- Calls and vararg handling ----------------------------------------- */
+
+  case BC_CALL: case BC_CALLM:  /* Both opcodes share this body; CALLM emits one extra add. */
+    |  ins_A_C	// RA = base, (RB = nresults+1,) RC = nargs+1 | extra_nargs
+    if (op == BC_CALLM) {
+      |  add NARGS:RD, MULTRES		// CALLM only: add MULTRES to the operand value to form the argument count.
+    }
+    |  cmp dword [BASE+RA*8+4], LJ_TFUNC	// Is the type word of the callee slot a function?
+    |  mov LFUNC:RB, [BASE+RA*8]	// mov leaves the flags intact, so the jne below still tests the cmp.
+    |  jne ->vmeta_call_ra		// Callee slot is not a function.
+    |  lea BASE, [BASE+RA*8+8]		// New BASE = the slot directly above the callee slot.
+    |  ins_call
+    break;
+
+  case BC_CALLMT:
+    |  ins_AD	// RA = base, RD = extra_nargs
+    |  add NARGS:RD, MULTRES
+    |  // Fall through. Assumes BC_CALLT follows and ins_AD is a no-op.
+    break;
+  case BC_CALLT:
+    |  ins_AD	// RA = base, RD = nargs+1
+    |  lea RA, [BASE+RA*8+8]
+    |  mov KBASE, BASE			// Use KBASE for move + vmeta_call hint.
+    |  mov LFUNC:RB, [RA-8]
+    |  cmp dword [RA-4], LJ_TFUNC
+    |  jne ->vmeta_call
+    |->BC_CALLT_Z:
+    |  mov PC, [BASE-4]
+    |  test PC, FRAME_TYPE
+    |  jnz >7
+    |1:
+    |  mov [BASE-8], LFUNC:RB		// Copy function down, reloaded below.
+    |  mov MULTRES, NARGS:RD
+    |  sub NARGS:RD, 1
+    |  jz >3
+    |2:  // Move args down.
+    |.if X64
+    |  mov RBa, [RA]
+    |  add RA, 8
+    |  mov [KBASE], RBa
+    |.else
+    |  mov RB, [RA]
+    |  mov [KBASE], RB
+    |  mov RB, [RA+4]
+    |  add RA, 8
+    |  mov [KBASE+4], RB
+    |.endif
+    |  add KBASE, 8
+    |  sub NARGS:RD, 1
+    |  jnz <2
+    |
+    |  mov LFUNC:RB, [BASE-8]
+    |3:
+    |  mov NARGS:RD, MULTRES
+    |  cmp byte LFUNC:RB->ffid, 1	// (> FF_C) Calling a fast function?
+    |  ja >5
+    |4:
+    |  ins_callt
+    |
+    |5:  // Tailcall to a fast function.
+    |  test PC, FRAME_TYPE		// Lua frame below?
+    |  jnz <4
+    |  movzx RA, PC_RA
+    |  not RAa
+    |  mov LFUNC:KBASE, [BASE+RA*8-8]	// Need to prepare KBASE.
+    |  mov KBASE, LFUNC:KBASE->pc
+    |  mov KBASE, [KBASE+PC2PROTO(k)]
+    |  jmp <4
+    |
+    |7:  // Tailcall from a vararg function.
+    |  sub PC, FRAME_VARG
+    |  test PC, FRAME_TYPEP
+    |  jnz >8				// Vararg frame below?
+    |  sub BASE, PC			// Need to relocate BASE/KBASE down.
+    |  mov KBASE, BASE
+    |  mov PC, [BASE-4]
+    |  jmp <1
+    |8:
+    |  add PC, FRAME_VARG
+    |  jmp <1
+    break;
+
+  case BC_ITERC:
+    |  ins_A	// RA = base, (RB = nresults+1,) RC = nargs+1 (2+1)
+    |  lea RA, [BASE+RA*8+8]		// fb = base+1
+    |.if X64
+    |  mov RBa, [RA-24]			// Copy state. fb[0] = fb[-3].
+    |  mov RCa, [RA-16]			// Copy control var. fb[1] = fb[-2].
+    |  mov [RA], RBa
+    |  mov [RA+8], RCa
+    |.else
+    |  mov RB, [RA-24]			// Copy state. fb[0] = fb[-3].
+    |  mov RC, [RA-20]
+    |  mov [RA], RB
+    |  mov [RA+4], RC
+    |  mov RB, [RA-16]			// Copy control var. fb[1] = fb[-2].
+    |  mov RC, [RA-12]
+    |  mov [RA+8], RB
+    |  mov [RA+12], RC
+    |.endif
+    |  mov LFUNC:RB, [RA-32]		// Copy callable. fb[-1] = fb[-4]
+    |  mov RC, [RA-28]
+    |  mov [RA-8], LFUNC:RB
+    |  mov [RA-4], RC
+    |  cmp RC, LJ_TFUNC			// Handle like a regular 2-arg call.
+    |  mov NARGS:RD, 2+1
+    |  jne ->vmeta_call
+    |  mov BASE, RA
+    |  ins_call
+    break;
+
+  case BC_ITERN:
+    |.if JIT
+    |  hotloop RB
+    |.endif
+    |->vm_IITERN:
+    |  ins_A	// RA = base, (RB = nresults+1, RC = nargs+1 (2+1))
+    |  mov TMP1, KBASE			// Need two more free registers.
+    |  mov TMP2, DISPATCH
+    |  mov TAB:RB, [BASE+RA*8-16]
+    |  mov RC, [BASE+RA*8-8]		// Get index from control var.
+    |  mov DISPATCH, TAB:RB->asize
+    |  add PC, 4
+    |  mov KBASE, TAB:RB->array
+    |1:  // Traverse array part.
+    |  cmp RC, DISPATCH; jae >5		// Index points after array part?
+    |  cmp dword [KBASE+RC*8+4], LJ_TNIL; je >4
+    |.if DUALNUM
+    |  mov dword [BASE+RA*8+4], LJ_TISNUM
+    |  mov dword [BASE+RA*8], RC
+    |.else
+    |  cvtsi2sd xmm0, RC
+    |.endif
+    |  // Copy array slot to returned value.
+    |.if X64
+    |  mov RBa, [KBASE+RC*8]
+    |  mov [BASE+RA*8+8], RBa
+    |.else
+    |  mov RB, [KBASE+RC*8+4]
+    |  mov [BASE+RA*8+12], RB
+    |  mov RB, [KBASE+RC*8]
+    |  mov [BASE+RA*8+8], RB
+    |.endif
+    |  add RC, 1
+    |  // Return array index as a numeric key.
+    |.if DUALNUM
+    |  // See above.
+    |.else
+    |  movsd qword [BASE+RA*8], xmm0
+    |.endif
+    |  mov [BASE+RA*8-8], RC		// Update control var.
+    |2:
+    |  movzx RD, PC_RD			// Get target from ITERL.
+    |  branchPC RD
+    |3:
+    |  mov DISPATCH, TMP2
+    |  mov KBASE, TMP1
+    |  ins_next
+    |
+    |4:  // Skip holes in array part.
+    |  add RC, 1
+    |  jmp <1
+    |
+    |5:  // Traverse hash part.
+    |  sub RC, DISPATCH
+    |6:
+    |  cmp RC, TAB:RB->hmask; ja <3	// End of iteration? Branch to ITERL+1.
+    |  imul KBASE, RC, #NODE
+    |  add NODE:KBASE, TAB:RB->node
+    |  cmp dword NODE:KBASE->val.it, LJ_TNIL; je >7
+    |  lea DISPATCH, [RC+DISPATCH+1]
+    |  // Copy key and value from hash slot.
+    |.if X64
+    |  mov RBa, NODE:KBASE->key
+    |  mov RCa, NODE:KBASE->val
+    |  mov [BASE+RA*8], RBa
+    |  mov [BASE+RA*8+8], RCa
+    |.else
+    |  mov RB, NODE:KBASE->key.gcr
+    |  mov RC, NODE:KBASE->key.it
+    |  mov [BASE+RA*8], RB
+    |  mov [BASE+RA*8+4], RC
+    |  mov RB, NODE:KBASE->val.gcr
+    |  mov RC, NODE:KBASE->val.it
+    |  mov [BASE+RA*8+8], RB
+    |  mov [BASE+RA*8+12], RC
+    |.endif
+    |  mov [BASE+RA*8-8], DISPATCH
+    |  jmp <2
+    |
+    |7:  // Skip holes in hash part.
+    |  add RC, 1
+    |  jmp <6
+    break;
+
+  case BC_ISNEXT:
+    |  ins_AD	// RA = base, RD = target (points to ITERN)
+    |  cmp dword [BASE+RA*8-20], LJ_TFUNC; jne >5
+    |  mov CFUNC:RB, [BASE+RA*8-24]
+    |  cmp dword [BASE+RA*8-12], LJ_TTAB; jne >5
+    |  cmp dword [BASE+RA*8-4], LJ_TNIL; jne >5
+    |  cmp byte CFUNC:RB->ffid, FF_next_N; jne >5
+    |  branchPC RD
+    |  mov dword [BASE+RA*8-8], 0	// Initialize control var.
+    |  mov dword [BASE+RA*8-4], LJ_KEYINDEX
+    |1:
+    |  ins_next
+    |5:  // Despecialize bytecode if any of the checks fail.
+    |  mov PC_OP, BC_JMP
+    |  branchPC RD
+    |.if JIT
+    |  cmp byte [PC], BC_ITERN
+    |  jne >6
+    |.endif
+    |  mov byte [PC], BC_ITERC
+    |  jmp <1
+    |.if JIT
+    |6:  // Unpatch JLOOP.
+    |  mov RA, [DISPATCH+DISPATCH_J(trace)]
+    |  movzx RC, word [PC+2]
+    |  mov TRACE:RA, [RA+RC*4]
+    |  mov eax, TRACE:RA->startins
+    |  mov al, BC_ITERC
+    |  mov dword [PC], eax
+    |  jmp <1
+    |.endif
+    break;
+
+  case BC_VARG:
+    |  ins_ABC	// RA = base, RB = nresults+1, RC = numparams
+    |  mov TMP1, KBASE			// Need one more free register.
+    |  lea KBASE, [BASE+RC*8+(8+FRAME_VARG)]
+    |  lea RA, [BASE+RA*8]
+    |  sub KBASE, [BASE-4]
+    |  // Note: KBASE may now be even _above_ BASE if nargs was < numparams.
+    |  test RB, RB
+    |  jz >5				// Copy all varargs?
+    |  lea RB, [RA+RB*8-8]
+    |  cmp KBASE, BASE			// No vararg slots?
+    |  jnb >2
+    |1:  // Copy vararg slots to destination slots.
+    |.if X64
+    |  mov RCa, [KBASE-8]
+    |  add KBASE, 8
+    |  mov [RA], RCa
+    |.else
+    |  mov RC, [KBASE-8]
+    |  mov [RA], RC
+    |  mov RC, [KBASE-4]
+    |  add KBASE, 8
+    |  mov [RA+4], RC
+    |.endif
+    |  add RA, 8
+    |  cmp RA, RB			// All destination slots filled?
+    |  jnb >3
+    |  cmp KBASE, BASE			// No more vararg slots?
+    |  jb <1
+    |2:  // Fill up remainder with nil.
+    |  mov dword [RA+4], LJ_TNIL
+    |  add RA, 8
+    |  cmp RA, RB
+    |  jb <2
+    |3:
+    |  mov KBASE, TMP1
+    |  ins_next
+    |
+    |5:  // Copy all varargs.
+    |  mov MULTRES, 1			// MULTRES = 0+1
+    |  mov RC, BASE
+    |  sub RC, KBASE
+    |  jbe <3				// No vararg slots?
+    |  mov RB, RC
+    |  shr RB, 3
+    |  add RB, 1
+    |  mov MULTRES, RB			// MULTRES = #varargs+1
+    |  mov L:RB, SAVE_L
+    |  add RC, RA
+    |  cmp RC, L:RB->maxstack
+    |  ja >7				// Need to grow stack?
+    |6:  // Copy all vararg slots.
+    |.if X64
+    |  mov RCa, [KBASE-8]
+    |  add KBASE, 8
+    |  mov [RA], RCa
+    |.else
+    |  mov RC, [KBASE-8]
+    |  mov [RA], RC
+    |  mov RC, [KBASE-4]
+    |  add KBASE, 8
+    |  mov [RA+4], RC
+    |.endif
+    |  add RA, 8
+    |  cmp KBASE, BASE			// No more vararg slots?
+    |  jb <6
+    |  jmp <3
+    |
+    |7:  // Grow stack for varargs.
+    |  mov L:RB->base, BASE
+    |  mov L:RB->top, RA
+    |  mov SAVE_PC, PC
+    |  sub KBASE, BASE			// Need delta, because BASE may change.
+    |  mov FCARG2, MULTRES
+    |  sub FCARG2, 1
+    |  mov FCARG1, L:RB
+    |  call extern lj_state_growstack@8	// (lua_State *L, int n)
+    |  mov BASE, L:RB->base
+    |  mov RA, L:RB->top
+    |  add KBASE, BASE
+    |  jmp <6
+    break;
+
+  /* -- Returns ----------------------------------------------------------- */
+
+  case BC_RETM:
+    |  ins_AD	// RA = results, RD = extra_nresults
+    |  add RD, MULTRES			// MULTRES >=1, so RD >=1.
+    |  // Fall through. Assumes BC_RET follows and ins_AD is a no-op.
+    break;
+
+  case BC_RET: case BC_RET0: case BC_RET1:
+    |  ins_AD	// RA = results, RD = nresults+1
+    if (op != BC_RET0) {
+      |  shl RA, 3
+    }
+    |1:
+    |  mov PC, [BASE-4]
+    |  mov MULTRES, RD			// Save nresults+1.
+    |  test PC, FRAME_TYPE		// Check frame type marker.
+    |  jnz >7				// Not returning to a fixarg Lua func?
+    switch (op) {
+    case BC_RET:
+      |->BC_RET_Z:
+      |  mov KBASE, BASE		// Use KBASE for result move.
+      |  sub RD, 1
+      |  jz >3
+      |2:  // Move results down.
+      |.if X64
+      |  mov RBa, [KBASE+RA]
+      |  mov [KBASE-8], RBa
+      |.else
+      |  mov RB, [KBASE+RA]
+      |  mov [KBASE-8], RB
+      |  mov RB, [KBASE+RA+4]
+      |  mov [KBASE-4], RB
+      |.endif
+      |  add KBASE, 8
+      |  sub RD, 1
+      |  jnz <2
+      |3:
+      |  mov RD, MULTRES		// Note: MULTRES may be >255.
+      |  movzx RB, PC_RB		// So cannot compare with RDL!
+      |5:
+      |  cmp RB, RD			// More results expected?
+      |  ja >6
+      break;
+    case BC_RET1:
+      |.if X64
+      |  mov RBa, [BASE+RA]
+      |  mov [BASE-8], RBa
+      |.else
+      |  mov RB, [BASE+RA+4]
+      |  mov [BASE-4], RB
+      |  mov RB, [BASE+RA]
+      |  mov [BASE-8], RB
+      |.endif
+      /* fallthrough */
+    case BC_RET0:
+      |5:
+      |  cmp PC_RB, RDL			// More results expected?
+      |  ja >6
+    default:
+      break;
+    }
+    |  movzx RA, PC_RA
+    |  not RAa				// Note: ~RA = -(RA+1)
+    |  lea BASE, [BASE+RA*8]		// base = base - (RA+1)*8
+    |  mov LFUNC:KBASE, [BASE-8]
+    |  mov KBASE, LFUNC:KBASE->pc
+    |  mov KBASE, [KBASE+PC2PROTO(k)]
+    |  ins_next
+    |
+    |6:  // Fill up results with nil.
+    if (op == BC_RET) {
+      |  mov dword [KBASE-4], LJ_TNIL	// Note: relies on shifted base.
+      |  add KBASE, 8
+    } else {
+      |  mov dword [BASE+RD*8-12], LJ_TNIL
+    }
+    |  add RD, 1
+    |  jmp <5
+    |
+    |7:  // Non-standard return case.
+    |  lea RB, [PC-FRAME_VARG]
+    |  test RB, FRAME_TYPEP
+    |  jnz ->vm_return
+    |  // Return from vararg function: relocate BASE down and RA up.
+    |  sub BASE, RB
+    if (op != BC_RET0) {
+      |  add RA, RB
+    }
+    |  jmp <1
+    break;
+
+  /* -- Loops and branches ------------------------------------------------ */
+
+  |.define FOR_IDX,  [RA];    .define FOR_TIDX,  dword [RA+4]
+  |.define FOR_STOP, [RA+8];  .define FOR_TSTOP, dword [RA+12]
+  |.define FOR_STEP, [RA+16]; .define FOR_TSTEP, dword [RA+20]
+  |.define FOR_EXT,  [RA+24]; .define FOR_TEXT,  dword [RA+28]
+
+  case BC_FORL:
+    |.if JIT
+    |  hotloop RB
+    |.endif
+    | // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
+    break;
+
+  case BC_JFORI:
+  case BC_JFORL:
+#if !LJ_HASJIT
+    break;
+#endif
+  case BC_FORI:
+  case BC_IFORL:
+    vk = (op == BC_IFORL || op == BC_JFORL);
+    |  ins_AJ	// RA = base, RD = target (after end of loop or start of loop)
+    |  lea RA, [BASE+RA*8]
+    if (LJ_DUALNUM) {
+      |  cmp FOR_TIDX, LJ_TISNUM; jne >9
+      if (!vk) {
+	|  cmp FOR_TSTOP, LJ_TISNUM; jne ->vmeta_for
+	|  cmp FOR_TSTEP, LJ_TISNUM; jne ->vmeta_for
+	|  mov RB, dword FOR_IDX
+	|  cmp dword FOR_STEP, 0; jl >5
+      } else {
+#ifdef LUA_USE_ASSERT
+	|  cmp FOR_TSTOP, LJ_TISNUM; jne ->assert_bad_for_arg_type
+	|  cmp FOR_TSTEP, LJ_TISNUM; jne ->assert_bad_for_arg_type
+#endif
+	|  mov RB, dword FOR_STEP
+	|  test RB, RB; js >5
+	|  add RB, dword FOR_IDX; jo >1
+	|  mov dword FOR_IDX, RB
+      }
+      |  cmp RB, dword FOR_STOP
+      |  mov FOR_TEXT, LJ_TISNUM
+      |  mov dword FOR_EXT, RB
+      if (op == BC_FORI) {
+	|  jle >7
+	|1:
+	|6:
+	|  branchPC RD
+      } else if (op == BC_JFORI) {
+	|  branchPC RD
+	|  movzx RD, PC_RD
+	|  jle =>BC_JLOOP
+	|1:
+	|6:
+      } else if (op == BC_IFORL) {
+	|  jg >7
+	|6:
+	|  branchPC RD
+	|1:
+      } else {
+	|  jle =>BC_JLOOP
+	|1:
+	|6:
+      }
+      |7:
+      |  ins_next
+      |
+      |5:  // Invert check for negative step.
+      if (vk) {
+	|  add RB, dword FOR_IDX; jo <1
+	|  mov dword FOR_IDX, RB
+      }
+      |  cmp RB, dword FOR_STOP
+      |  mov FOR_TEXT, LJ_TISNUM
+      |  mov dword FOR_EXT, RB
+      if (op == BC_FORI) {
+	|  jge <7
+      } else if (op == BC_JFORI) {
+	|  branchPC RD
+	|  movzx RD, PC_RD
+	|  jge =>BC_JLOOP
+      } else if (op == BC_IFORL) {
+	|  jl <7
+      } else {
+	|  jge =>BC_JLOOP
+      }
+      |  jmp <6
+      |9:  // Fallback to FP variant.
+    } else if (!vk) {
+      |  cmp FOR_TIDX, LJ_TISNUM
+    }
+    if (!vk) {
+      |  jae ->vmeta_for
+      |  cmp FOR_TSTOP, LJ_TISNUM; jae ->vmeta_for
+    } else {
+#ifdef LUA_USE_ASSERT
+      |  cmp FOR_TSTOP, LJ_TISNUM; jae ->assert_bad_for_arg_type
+      |  cmp FOR_TSTEP, LJ_TISNUM; jae ->assert_bad_for_arg_type
+#endif
+    }
+    |  mov RB, FOR_TSTEP		// Load type/hiword of for step.
+    if (!vk) {
+      |  cmp RB, LJ_TISNUM; jae ->vmeta_for
+    }
+    |  movsd xmm0, qword FOR_IDX
+    |  movsd xmm1, qword FOR_STOP
+    if (vk) {
+      |  addsd xmm0, qword FOR_STEP
+      |  movsd qword FOR_IDX, xmm0
+      |  test RB, RB; js >3
+    } else {
+      |  jl >3
+    }
+    |  ucomisd xmm1, xmm0
+    |1:
+    |  movsd qword FOR_EXT, xmm0
+    if (op == BC_FORI) {
+      |.if DUALNUM
+      |  jnb <7
+      |.else
+      |  jnb >2
+      |  branchPC RD
+      |.endif
+    } else if (op == BC_JFORI) {
+      |  branchPC RD
+      |  movzx RD, PC_RD
+      |  jnb =>BC_JLOOP
+    } else if (op == BC_IFORL) {
+      |.if DUALNUM
+      |  jb <7
+      |.else
+      |  jb >2
+      |  branchPC RD
+      |.endif
+    } else {
+      |  jnb =>BC_JLOOP
+    }
+    |.if DUALNUM
+    |  jmp <6
+    |.else
+    |2:
+    |  ins_next
+    |.endif
+    |
+    |3:  // Invert comparison if step is negative.
+    |  ucomisd xmm0, xmm1
+    |  jmp <1
+    break;
+
+  case BC_ITERL:
+    |.if JIT
+    |  hotloop RB
+    |.endif
+    | // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op.
+    break;
+
+  case BC_JITERL:
+#if !LJ_HASJIT
+    break;  /* Without LJ_HASJIT no code is emitted for BC_JITERL. */
+#endif
+  case BC_IITERL:
+    |  ins_AJ	// RA = base, RD = target
+    |  lea RA, [BASE+RA*8]		// RA = address of slot A.
+    |  mov RB, [RA+4]			// RB = type word of slot A.
+    |  cmp RB, LJ_TNIL; je >1		// Stop if iterator returned nil.
+    if (op == BC_JITERL) {
+      |  mov [RA-4], RB		// Copy slot A into the slot below it (type word first).
+      |  mov RB, [RA]
+      |  mov [RA-8], RB
+      |  jmp =>BC_JLOOP		// Continue in the BC_JLOOP handler.
+    } else {
+      |  branchPC RD			// Otherwise save control var + branch.
+      |  mov RD, [RA]			// RD is free again after branchPC; reuse it for the low word.
+      |  mov [RA-4], RB		// Copy slot A into the slot below it.
+      |  mov [RA-8], RD
+    }
+    |1:
+    |  ins_next
+    break;
+
+  case BC_LOOP:
+    |  ins_A	// RA = base, RD = target (loop extent)
+    |  // Note: RA/RD are only used by the trace recorder to determine scope/extent.
+    |  // This opcode does NOT jump; its only purpose is to detect a hot loop.
+    |.if JIT
+    |  hotloop RB			// Macro defined elsewhere; emitted only when JIT is enabled.
+    |.endif
+    | // Fall through. Assumes BC_ILOOP follows and ins_A is a no-op.
+    break;  /* Ends the C case only; no ins_next is emitted in this arm. */
+
+  case BC_ILOOP:
+    |  ins_A	// RA = base, RD = target (loop extent)
+    |  ins_next
+    break;
+
+  case BC_JLOOP:
+    |.if JIT
+    |  ins_AD	// RA = base (ignored), RD = traceno
+    |  mov RA, [DISPATCH+DISPATCH_J(trace)]
+    |  mov TRACE:RD, [RA+RD*4]
+    |  mov RDa, TRACE:RD->mcode
+    |  mov L:RB, SAVE_L
+    |  mov [DISPATCH+DISPATCH_GL(jit_base)], BASE
+    |  mov [DISPATCH+DISPATCH_GL(tmpbuf.L)], L:RB
+    |  // Save additional callee-save registers only used in compiled code.
+    |.if X64WIN
+    |  mov TMPQ, r12
+    |  mov TMPa, r13
+    |  mov CSAVE_4, r14
+    |  mov CSAVE_3, r15
+    |  mov RAa, rsp
+    |  sub rsp, 9*16+4*8
+    |  movdqa [RAa], xmm6
+    |  movdqa [RAa-1*16], xmm7
+    |  movdqa [RAa-2*16], xmm8
+    |  movdqa [RAa-3*16], xmm9
+    |  movdqa [RAa-4*16], xmm10
+    |  movdqa [RAa-5*16], xmm11
+    |  movdqa [RAa-6*16], xmm12
+    |  movdqa [RAa-7*16], xmm13
+    |  movdqa [RAa-8*16], xmm14
+    |  movdqa [RAa-9*16], xmm15
+    |.elif X64
+    |  mov TMPQ, r12
+    |  mov TMPa, r13
+    |  sub rsp, 16
+    |.endif
+    |  jmp RDa
+    |.endif
+    break;
+
+  case BC_JMP:
+    |  ins_AJ	// RA = unused, RD = target
+    |  branchPC RD
+    |  ins_next
+    break;
+
+  /* -- Function headers -------------------------------------------------- */
+
+   /*
+   ** Reminder: A function may be called with func/args above L->maxstack,
+   ** i.e. occupying EXTRA_STACK slots. And vmeta_call may add one extra slot,
+   ** too. This means all FUNC* ops (including fast functions) must check
+   ** for stack overflow _before_ adding more slots!
+   */
+
+  case BC_FUNCF:
+    |.if JIT
+    |  hotcall RB
+    |.endif
+  case BC_FUNCV:  /* NYI: compiled vararg functions. */
+    | // Fall through. Assumes BC_IFUNCF/BC_IFUNCV follow and ins_AD is a no-op.
+    break;
+
+  case BC_JFUNCF:
+#if !LJ_HASJIT
+    break;  /* Without LJ_HASJIT no code is emitted for BC_JFUNCF. */
+#endif
+  case BC_IFUNCF:
+    |  ins_AD  // BASE = new base, RA = framesize, RD = nargs+1
+    |  mov KBASE, [PC-4+PC2PROTO(k)]	// Load the k field, addressed relative to PC via PC2PROTO, into KBASE.
+    |  mov L:RB, SAVE_L
+    |  lea RA, [BASE+RA*8]		// Top of frame.
+    |  cmp RA, L:RB->maxstack
+    |  ja ->vm_growstack_f		// Frame top lies above L->maxstack.
+    |  movzx RA, byte [PC-4+PC2PROTO(numparams)]	// RA now holds numparams (a byte field).
+    |  cmp NARGS:RD, RA			// Check for missing parameters.
+    |  jbe >3				// nargs+1 <= numparams: at least one parameter was not passed.
+    |2:
+    if (op == BC_JFUNCF) {
+      |  movzx RD, PC_RD		// Reload the D operand of the instruction into RD.
+      |  jmp =>BC_JLOOP		// Continue in the BC_JLOOP handler.
+    } else {
+      |  ins_next
+    }
+    |
+    |3:  // Clear missing parameters.
+    |  mov dword [BASE+NARGS:RD*8-4], LJ_TNIL	// Nil the type word of slot RD-1 (RD counts nargs+1).
+    |  add NARGS:RD, 1
+    |  cmp NARGS:RD, RA
+    |  jbe <3				// Repeat until RD exceeds numparams.
+    |  jmp <2
+    break;
+
+  case BC_JFUNCV:
+#if !LJ_HASJIT
+    break;
+#endif
+    | int3  // NYI: compiled vararg functions
+    break;  /* NYI: compiled vararg functions. */
+
+  case BC_IFUNCV:
+    |  ins_AD  // BASE = new base, RA = framesize, RD = nargs+1
+    |  lea RB, [NARGS:RD*8+FRAME_VARG]
+    |  lea RD, [BASE+NARGS:RD*8]
+    |  mov LFUNC:KBASE, [BASE-8]
+    |  mov [RD-4], RB			// Store delta + FRAME_VARG.
+    |  mov [RD-8], LFUNC:KBASE		// Store copy of LFUNC.
+    |  mov L:RB, SAVE_L
+    |  lea RA, [RD+RA*8]
+    |  cmp RA, L:RB->maxstack
+    |  ja ->vm_growstack_v		// Need to grow stack.
+    |  mov RA, BASE
+    |  mov BASE, RD
+    |  movzx RB, byte [PC-4+PC2PROTO(numparams)]
+    |  test RB, RB
+    |  jz >2
+    |1:  // Copy fixarg slots up to new frame.
+    |  add RA, 8
+    |  cmp RA, BASE
+    |  jnb >3				// Less args than parameters?
+    |  mov KBASE, [RA-8]
+    |  mov [RD], KBASE
+    |  mov KBASE, [RA-4]
+    |  mov [RD+4], KBASE
+    |  add RD, 8
+    |  mov dword [RA-4], LJ_TNIL	// Clear old fixarg slot (help the GC).
+    |  sub RB, 1
+    |  jnz <1
+    |2:
+    if (op == BC_JFUNCV) {
+      |  movzx RD, PC_RD
+      |  jmp =>BC_JLOOP
+    } else {
+      |  mov KBASE, [PC-4+PC2PROTO(k)]
+      |  ins_next
+    }
+    |
+    |3:  // Clear missing parameters.
+    |  mov dword [RD+4], LJ_TNIL
+    |  add RD, 8
+    |  sub RB, 1
+    |  jnz <3
+    |  jmp <2
+    break;
+
+  case BC_FUNCC:
+  case BC_FUNCCW:
+    |  ins_AD  // BASE = new base, RA = ins RA|RD (unused), RD = nargs+1
+    |  mov CFUNC:RB, [BASE-8]
+    |  mov KBASEa, CFUNC:RB->f
+    |  mov L:RB, SAVE_L
+    |  lea RD, [BASE+NARGS:RD*8-8]
+    |  mov L:RB->base, BASE
+    |  lea RA, [RD+8*LUA_MINSTACK]
+    |  cmp RA, L:RB->maxstack
+    |  mov L:RB->top, RD
+    if (op == BC_FUNCC) {
+      |.if X64
+      |  mov CARG1d, L:RB			// Caveat: CARG1d may be RA.
+      |.else
+      |  mov ARG1, L:RB
+      |.endif
+    } else {
+      |.if X64
+      |  mov CARG2, KBASEa
+      |  mov CARG1d, L:RB			// Caveat: CARG1d may be RA.
+      |.else
+      |  mov ARG2, KBASEa
+      |  mov ARG1, L:RB
+      |.endif
+    }
+    |  ja ->vm_growstack_c		// Need to grow stack.
+    |  set_vmstate C
+    if (op == BC_FUNCC) {
+      |  call KBASEa			// (lua_State *L)
+    } else {
+      |  // (lua_State *L, lua_CFunction f)
+      |  call aword [DISPATCH+DISPATCH_GL(wrapf)]
+    }
+    |  // nresults returned in eax (RD).
+    |  mov BASE, L:RB->base
+    |  mov [DISPATCH+DISPATCH_GL(cur_L)], L:RB
+    |  set_vmstate INTERP
+    |  lea RA, [BASE+RD*8]
+    |  neg RA
+    |  add RA, L:RB->top		// RA = (L->top-(L->base+nresults))*8
+    |  mov PC, [BASE-4]			// Fetch PC of caller.
+    |  jmp ->vm_returnc
+    break;
+
+  /* ---------------------------------------------------------------------- */
+
+  default:
+    fprintf(stderr, "Error: undefined opcode BC_%s\n", bc_names[op]);
+    exit(2);
+    break;
+  }
+}
+
+/*
+** Emit the whole backend into the build context.
+**
+** Calls dasm_growpc() with BC__MAX (the =>BC_xxx labels used by the opcode
+** handlers are indexed by opcode), emits the helper subroutines, then
+** switches to the code_op section and emits one handler per opcode in
+** ascending opcode order. Several handlers fall through into the next
+** opcode's code, so the emission order must not change.
+**
+** Returns BC__MAX.
+*/
+static int build_backend(BuildCtx *ctx)
+{
+  int bc = 0;
+
+  dasm_growpc(Dst, BC__MAX);
+  build_subroutines(ctx);
+
+  |.code_op
+  while (bc < BC__MAX) {
+    build_ins(ctx, (BCOp)bc, bc);
+    bc++;
+  }
+
+  return BC__MAX;
+}
+
+/* Emit pseudo frame-info for all assembler functions. */
+static void emit_asm_debug(BuildCtx *ctx)
+{
+  int fcofs = (int)((uint8_t *)ctx->glob[GLOB_vm_ffi_call] - ctx->code);
+#if LJ_64
+#define SZPTR	"8"
+#define BSZPTR	"3"
+#define REG_SP	"0x7"
+#define REG_RA	"0x10"
+#else
+#define SZPTR	"4"
+#define BSZPTR	"2"
+#define REG_SP	"0x4"
+#define REG_RA	"0x8"
+#endif
+  switch (ctx->mode) {
+  case BUILD_elfasm:
+    fprintf(ctx->fp, "\t.section .debug_frame,\"\",@progbits\n");
+    fprintf(ctx->fp,
+	".Lframe0:\n"
+	"\t.long .LECIE0-.LSCIE0\n"
+	".LSCIE0:\n"
+	"\t.long 0xffffffff\n"
+	"\t.byte 0x1\n"
+	"\t.string \"\"\n"
+	"\t.uleb128 0x1\n"
+	"\t.sleb128 -" SZPTR "\n"
+	"\t.byte " REG_RA "\n"
+	"\t.byte 0xc\n\t.uleb128 " REG_SP "\n\t.uleb128 " SZPTR "\n"
+	"\t.byte 0x80+" REG_RA "\n\t.uleb128 0x1\n"
+	"\t.align " SZPTR "\n"
+	".LECIE0:\n\n");
+    fprintf(ctx->fp,
+	".LSFDE0:\n"
+	"\t.long .LEFDE0-.LASFDE0\n"
+	".LASFDE0:\n"
+	"\t.long .Lframe0\n"
+#if LJ_64
+	"\t.quad .Lbegin\n"
+	"\t.quad %d\n"
+	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
+	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
+	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
+	"\t.byte 0x8f\n\t.uleb128 0x4\n"	/* offset r15 */
+	"\t.byte 0x8e\n\t.uleb128 0x5\n"	/* offset r14 */
+#if LJ_NO_UNWIND
+	"\t.byte 0x8d\n\t.uleb128 0x6\n"	/* offset r13 */
+	"\t.byte 0x8c\n\t.uleb128 0x7\n"	/* offset r12 */
+#endif
+#else
+	"\t.long .Lbegin\n"
+	"\t.long %d\n"
+	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
+	"\t.byte 0x85\n\t.uleb128 0x2\n"	/* offset ebp */
+	"\t.byte 0x87\n\t.uleb128 0x3\n"	/* offset edi */
+	"\t.byte 0x86\n\t.uleb128 0x4\n"	/* offset esi */
+	"\t.byte 0x83\n\t.uleb128 0x5\n"	/* offset ebx */
+#endif
+	"\t.align " SZPTR "\n"
+	".LEFDE0:\n\n", fcofs, CFRAME_SIZE);
+#if LJ_HASFFI
+    fprintf(ctx->fp,
+	".LSFDE1:\n"
+	"\t.long .LEFDE1-.LASFDE1\n"
+	".LASFDE1:\n"
+	"\t.long .Lframe0\n"
+#if LJ_64
+	"\t.quad lj_vm_ffi_call\n"
+	"\t.quad %d\n"
+	"\t.byte 0xe\n\t.uleb128 16\n"		/* def_cfa_offset */
+	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
+	"\t.byte 0xd\n\t.uleb128 0x6\n"		/* def_cfa_register rbp */
+	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
+#else
+	"\t.long lj_vm_ffi_call\n"
+	"\t.long %d\n"
+	"\t.byte 0xe\n\t.uleb128 8\n"		/* def_cfa_offset */
+	"\t.byte 0x85\n\t.uleb128 0x2\n"	/* offset ebp */
+	"\t.byte 0xd\n\t.uleb128 0x5\n"		/* def_cfa_register ebp */
+	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset ebx */
+#endif
+	"\t.align " SZPTR "\n"
+	".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
+#endif
+#if !LJ_NO_UNWIND
+#if LJ_TARGET_SOLARIS
+#if LJ_64
+    fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@unwind\n");
+#else
+    fprintf(ctx->fp, "\t.section .eh_frame,\"aw\",@progbits\n");
+#endif
+#else
+    fprintf(ctx->fp, "\t.section .eh_frame,\"a\",@progbits\n");
+#endif
+    fprintf(ctx->fp,
+	".Lframe1:\n"
+	"\t.long .LECIE1-.LSCIE1\n"
+	".LSCIE1:\n"
+	"\t.long 0\n"
+	"\t.byte 0x1\n"
+	"\t.string \"zPR\"\n"
+	"\t.uleb128 0x1\n"
+	"\t.sleb128 -" SZPTR "\n"
+	"\t.byte " REG_RA "\n"
+	"\t.uleb128 6\n"			/* augmentation length */
+	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
+	"\t.long lj_err_unwind_dwarf-.\n"
+	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
+	"\t.byte 0xc\n\t.uleb128 " REG_SP "\n\t.uleb128 " SZPTR "\n"
+	"\t.byte 0x80+" REG_RA "\n\t.uleb128 0x1\n"
+	"\t.align " SZPTR "\n"
+	".LECIE1:\n\n");
+    fprintf(ctx->fp,
+	".LSFDE2:\n"
+	"\t.long .LEFDE2-.LASFDE2\n"
+	".LASFDE2:\n"
+	"\t.long .LASFDE2-.Lframe1\n"
+	"\t.long .Lbegin-.\n"
+	"\t.long %d\n"
+	"\t.uleb128 0\n"			/* augmentation length */
+	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
+#if LJ_64
+	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
+	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
+	"\t.byte 0x8f\n\t.uleb128 0x4\n"	/* offset r15 */
+	"\t.byte 0x8e\n\t.uleb128 0x5\n"	/* offset r14 */
+#else
+	"\t.byte 0x85\n\t.uleb128 0x2\n"	/* offset ebp */
+	"\t.byte 0x87\n\t.uleb128 0x3\n"	/* offset edi */
+	"\t.byte 0x86\n\t.uleb128 0x4\n"	/* offset esi */
+	"\t.byte 0x83\n\t.uleb128 0x5\n"	/* offset ebx */
+#endif
+	"\t.align " SZPTR "\n"
+	".LEFDE2:\n\n", fcofs, CFRAME_SIZE);
+#if LJ_HASFFI
+    fprintf(ctx->fp,
+	".Lframe2:\n"
+	"\t.long .LECIE2-.LSCIE2\n"
+	".LSCIE2:\n"
+	"\t.long 0\n"
+	"\t.byte 0x1\n"
+	"\t.string \"zR\"\n"
+	"\t.uleb128 0x1\n"
+	"\t.sleb128 -" SZPTR "\n"
+	"\t.byte " REG_RA "\n"
+	"\t.uleb128 1\n"			/* augmentation length */
+	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
+	"\t.byte 0xc\n\t.uleb128 " REG_SP "\n\t.uleb128 " SZPTR "\n"
+	"\t.byte 0x80+" REG_RA "\n\t.uleb128 0x1\n"
+	"\t.align " SZPTR "\n"
+	".LECIE2:\n\n");
+    fprintf(ctx->fp,
+	".LSFDE3:\n"
+	"\t.long .LEFDE3-.LASFDE3\n"
+	".LASFDE3:\n"
+	"\t.long .LASFDE3-.Lframe2\n"
+	"\t.long lj_vm_ffi_call-.\n"
+	"\t.long %d\n"
+	"\t.uleb128 0\n"			/* augmentation length */
+#if LJ_64
+	"\t.byte 0xe\n\t.uleb128 16\n"		/* def_cfa_offset */
+	"\t.byte 0x86\n\t.uleb128 0x2\n"	/* offset rbp */
+	"\t.byte 0xd\n\t.uleb128 0x6\n"		/* def_cfa_register rbp */
+	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset rbx */
+#else
+	"\t.byte 0xe\n\t.uleb128 8\n"		/* def_cfa_offset */
+	"\t.byte 0x85\n\t.uleb128 0x2\n"	/* offset ebp */
+	"\t.byte 0xd\n\t.uleb128 0x5\n"		/* def_cfa_register ebp */
+	"\t.byte 0x83\n\t.uleb128 0x3\n"	/* offset ebx */
+#endif
+	"\t.align " SZPTR "\n"
+	".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
+#endif
+#endif
+    break;
+#if !LJ_NO_UNWIND
+  /* Mental note: never let Apple design an assembler.
+  ** Or a linker. Or a plastic case. But I digress.
+  */
+  case BUILD_machasm: {
+    /* Mach-O assembler mode: write hand-built unwind records to ctx->fp.
+    ** Emitted in this order:
+    **   1. the __TEXT,__eh_frame section header and a CIE (EH_frame1),
+    **   2. one FDE per non-empty symbol in ctx->sym,
+    **   3. with LJ_HASFFI, a second CIE (EH_frame2) plus an FDE for
+    **      _lj_vm_ffi_call, but only if that symbol was seen in step 2,
+    **   4. for non-LJ_64 builds, a non-lazy pointer and jump table stubs,
+    **   5. the trailing .subsections_via_symbols directive.
+    */
+#if LJ_HASFFI
+    /* Size of _lj_vm_ffi_call as found in the symbol loop; 0 = not found. */
+    int fcsize = 0;
+#endif
+    int i;
+    fprintf(ctx->fp, "\t.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support\n");
+    /* First CIE. The augmentation string is "zPR" and the augmentation data
+    ** carries a reference to _lj_err_unwind_dwarf: via @GOTPCREL on LJ_64,
+    ** via the L_lj_err_unwind_dwarf$non_lazy_ptr slot (defined near the end
+    ** of this case) otherwise.
+    */
+    fprintf(ctx->fp,
+	"EH_frame1:\n"
+	"\t.set L$set$x,LECIEX-LSCIEX\n"
+	"\t.long L$set$x\n"
+	"LSCIEX:\n"
+	"\t.long 0\n"
+	"\t.byte 0x1\n"
+	"\t.ascii \"zPR\\0\"\n"
+	"\t.byte 0x1\n"
+	"\t.byte 128-" SZPTR "\n"
+	"\t.byte " REG_RA "\n"
+	"\t.byte 6\n"				/* augmentation length */
+	"\t.byte 0x9b\n"			/* indirect|pcrel|sdata4 */
+#if LJ_64
+	"\t.long _lj_err_unwind_dwarf+4@GOTPCREL\n"
+	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
+	"\t.byte 0xc\n\t.byte " REG_SP "\n\t.byte " SZPTR "\n"
+#else
+	"\t.long L_lj_err_unwind_dwarf$non_lazy_ptr-.\n"
+	"\t.byte 0x1b\n"			/* pcrel|sdata4 */
+	"\t.byte 0xc\n\t.byte 0x5\n\t.byte 0x4\n"  /* esp=5 on 32 bit MACH-O. */
+#endif
+	"\t.byte 0x80+" REG_RA "\n\t.byte 0x1\n"
+	"\t.align " BSZPTR "\n"
+	"LECIEX:\n\n");
+    /* One FDE per symbol, each pointing back at EH_frame1. */
+    for (i = 0; i < ctx->nsym; i++) {
+      const char *name = ctx->sym[i].name;
+      /* Size = distance to the next symbol's offset. This reads sym[i+1]
+      ** even when i is the last index, so the table presumably holds an
+      ** extra end entry -- TODO confirm where ctx->sym is filled.
+      */
+      int32_t size = ctx->sym[i+1].ofs - ctx->sym[i].ofs;
+      if (size == 0) continue;  /* Nothing to cover for empty symbols. */
+#if LJ_HASFFI
+      /* _lj_vm_ffi_call gets no FDE here: only its size is remembered and
+      ** its FDE is written after the loop against EH_frame2.
+      */
+      if (!strcmp(name, "_lj_vm_ffi_call")) { fcsize = size; continue; }
+#endif
+      /* The arguments follow the order of the conversions in the format:
+      ** name and i build the unique labels, the second name is the start of
+      ** the covered range, size its length, CFRAME_SIZE the def_cfa_offset
+      ** operand.
+      ** NOTE(review): def_cfa_offset is written with a single .byte here
+      ** (the non-Mach-O output above uses .uleb128), which only matches a
+      ** ULEB128 encoding while CFRAME_SIZE is below 128 -- confirm.
+      */
+      fprintf(ctx->fp,
+	  "%s.eh:\n"
+	  "LSFDE%d:\n"
+	  "\t.set L$set$%d,LEFDE%d-LASFDE%d\n"
+	  "\t.long L$set$%d\n"
+	  "LASFDE%d:\n"
+	  "\t.long LASFDE%d-EH_frame1\n"
+	  "\t.long %s-.\n"
+	  "\t.long %d\n"
+	  "\t.byte 0\n"				/* augmentation length */
+	  "\t.byte 0xe\n\t.byte %d\n"		/* def_cfa_offset */
+#if LJ_64
+	  "\t.byte 0x86\n\t.byte 0x2\n"		/* offset rbp */
+	  "\t.byte 0x83\n\t.byte 0x3\n"		/* offset rbx */
+	  "\t.byte 0x8f\n\t.byte 0x4\n"		/* offset r15 */
+	  "\t.byte 0x8e\n\t.byte 0x5\n"		/* offset r14 */
+#else
+	  "\t.byte 0x84\n\t.byte 0x2\n"		/* offset ebp (4 for MACH-O)*/
+	  "\t.byte 0x87\n\t.byte 0x3\n"		/* offset edi */
+	  "\t.byte 0x86\n\t.byte 0x4\n"		/* offset esi */
+	  "\t.byte 0x83\n\t.byte 0x5\n"		/* offset ebx */
+#endif
+	  "\t.align " BSZPTR "\n"
+	  "LEFDE%d:\n\n",
+	  name, i, i, i, i, i, i, i, name, size, CFRAME_SIZE, i);
+    }
+#if LJ_HASFFI
+    /* Only emitted when the loop above found a non-empty _lj_vm_ffi_call. */
+    if (fcsize) {
+      /* Second CIE: augmentation string "zR" with one byte of augmentation
+      ** data, i.e. without the _lj_err_unwind_dwarf reference of EH_frame1.
+      */
+      fprintf(ctx->fp,
+	  "EH_frame2:\n"
+	  "\t.set L$set$y,LECIEY-LSCIEY\n"
+	  "\t.long L$set$y\n"
+	  "LSCIEY:\n"
+	  "\t.long 0\n"
+	  "\t.byte 0x1\n"
+	  "\t.ascii \"zR\\0\"\n"
+	  "\t.byte 0x1\n"
+	  "\t.byte 128-" SZPTR "\n"
+	  "\t.byte " REG_RA "\n"
+	  "\t.byte 1\n"				/* augmentation length */
+#if LJ_64
+	  "\t.byte 0x1b\n"			/* pcrel|sdata4 */
+	  "\t.byte 0xc\n\t.byte " REG_SP "\n\t.byte " SZPTR "\n"
+#else
+	  "\t.byte 0x1b\n"			/* pcrel|sdata4 */
+	  "\t.byte 0xc\n\t.byte 0x5\n\t.byte 0x4\n"  /* esp=5 on 32 bit MACH. */
+#endif
+	  "\t.byte 0x80+" REG_RA "\n\t.byte 0x1\n"
+	  "\t.align " BSZPTR "\n"
+	  "LECIEY:\n\n");
+      /* FDE for _lj_vm_ffi_call, covering fcsize bytes and tied to
+      ** EH_frame2. Unlike the per-symbol FDEs it switches the CFA register
+      ** to the frame pointer (def_cfa_register) and records only rbp/ebp
+      ** and rbx/ebx.
+      */
+      fprintf(ctx->fp,
+	  "_lj_vm_ffi_call.eh:\n"
+	  "LSFDEY:\n"
+	  "\t.set L$set$yy,LEFDEY-LASFDEY\n"
+	  "\t.long L$set$yy\n"
+	  "LASFDEY:\n"
+	  "\t.long LASFDEY-EH_frame2\n"
+	  "\t.long _lj_vm_ffi_call-.\n"
+	  "\t.long %d\n"
+	  "\t.byte 0\n"				/* augmentation length */
+#if LJ_64
+	  "\t.byte 0xe\n\t.byte 16\n"		/* def_cfa_offset */
+	  "\t.byte 0x86\n\t.byte 0x2\n"		/* offset rbp */
+	  "\t.byte 0xd\n\t.byte 0x6\n"		/* def_cfa_register rbp */
+	  "\t.byte 0x83\n\t.byte 0x3\n"		/* offset rbx */
+#else
+	  "\t.byte 0xe\n\t.byte 8\n"		/* def_cfa_offset */
+	  "\t.byte 0x84\n\t.byte 0x2\n"		/* offset ebp (4 for MACH-O)*/
+	  "\t.byte 0xd\n\t.byte 0x4\n"		/* def_cfa_register ebp */
+	  "\t.byte 0x83\n\t.byte 0x3\n"		/* offset ebx */
+#endif
+	  "\t.align " BSZPTR "\n"
+	  "LEFDEY:\n\n", fcsize);
+    }
+#endif
+#if !LJ_64
+    /* 32 bit only: define the non-lazy pointer slot that the EH_frame1 CIE
+    ** refers to, bound to _lj_err_unwind_dwarf as an indirect symbol.
+    */
+    fprintf(ctx->fp,
+      "\t.non_lazy_symbol_pointer\n"
+      "L_lj_err_unwind_dwarf$non_lazy_ptr:\n"
+      ".indirect_symbol _lj_err_unwind_dwarf\n"
+      ".long 0\n\n");
+    fprintf(ctx->fp, "\t.section __IMPORT,__jump_table,symbol_stubs,pure_instructions+self_modifying_code,5\n");
+    {
+      /* Emit a stub for every entry of the NULL-terminated ctx->extnames
+      ** list whose name does NOT start with LABEL_PREFIX (strncmp() != 0).
+      ** Each stub is five bytes of octal 364 (0xf4), matching the stub size
+      ** of 5 given in the section directive; presumably placeholders that
+      ** get patched at load time -- TODO confirm.
+      */
+      const char *const *xn;
+      for (xn = ctx->extnames; *xn; xn++)
+	if (strncmp(*xn, LABEL_PREFIX, sizeof(LABEL_PREFIX)-1))
+	  fprintf(ctx->fp, "L_%s$stub:\n\t.indirect_symbol _%s\n\t.ascii \"\\364\\364\\364\\364\\364\"\n", *xn, *xn);
+    }
+#endif
+    fprintf(ctx->fp, ".subsections_via_symbols\n");
+    }
+    break;
+#endif
+  default:  /* Difficult for other modes. */
+    break;
+  }
+}
+