// Michal Vlasák, FIT CTU, 2023
//
// An example of a practical use of DynASM, the dynamic assembler from the
// LuaJIT project by Mike Pall:
//
//         https://luajit.org/dynasm.html
//
// An indespensable resource for use of DynASM, except for the official sources
// is the unofficial documentation by Peter Cawley and includes a tutorial and
// a reference for the DynASM API and the x86/x86-64 instructions:
//
//         https://corsix.github.io/dynasm-doc/
//
// A great blog post introducing JITs and featuring DynASM has been written by
// Josh Haberman:
//
//         https://blog.reverberate.org/2012/12/hello-jit-world-joy-of-simple-jits.html
//
//
// This example demonstrates a template JIT compilation of a program for a very
// simple stack machine. The bytecode design, an example program and the
// original interpreter implemenation is due to Martin Dørum:
//
//         https://mort.coffee/home/fast-interpreters/

// Because of glibc versions before 2.37 by mistake didn't expose
// `MAP_ANONYMOUS` by default, and needed at least _BSD_SOURCE. But since 2.20,
// _BSD_SOURCE is deprecated and _DEFAULT_SOURCE should be used instead. Not
// ideal. See also feature_test_macros(7).
#define _BSD_SOURCE
#define _DEFAULT_SOURCE

// First, some generic includes.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>

// These defines are very not not portable. But the rest of our program depends
// on bigger details of the x86-64 architecture anyways, so assuming that int is
// 32 bit signed integer (which dynasm also assumes) is least of our concerns.
typedef unsigned char u8;
typedef int i32;
typedef unsigned int u32;

// We need mmap and mprotect (on POSIX systems) or VirtualAlloc and
// VirtualProtect (on Windows). See their later use in this file.
#if _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif

// For Macs we need to use MAP_JIT, we wrap it in `MAP_JIT_VALUE`, so we can be
// platform independent in the rest of the code. Also on Macs we have to
// explicitly invalidate the instruction cache for the processor to see the new
// executable code. This is a noop on x86-64 (otherwise we would also need to
// consider the issue on other operating systems), but let's hope that just this
// is enough to work on ARM64 based Macs with Rosetta 2.
#if defined(__APPLE__) && defined(__MACH__)
#define MAP_JIT_VALUE MAP_JIT
void sys_icache_invalidate(void *start, size_t len);
#else
#define MAP_JIT_VALUE 0
#endif

// Before we include dynasm includes, we need to (or rather want to) define a
// few macros that these headers use. Skip ahead until you need to / want to
// understand this. They allow us to customize the signatures and bodies of all
// dynasm functions. Dynasm will translate assembly snippets in our C source by
// replacing them with calls to `dasm_put`, with `Dst` as the first argument,
// and then some other arguments we don't care about now.
//
//         dasm_put(Dst, index, varargs...)
//
// Dynasm functions have signatures and bodies which look something like this:
//
//         void
//         dasm_X(Dst_DECL, ...)
//         {
//                 Dst_REF = ...;
//                 ...
//                 dasm_State *D = Dst_REF;
//                 ...
//                 dasm_Y(Dst);
//                 ...
//         }
//
// I.e. two more macros (`Dst_DECL` and `Dst_REF`) are used. The macro
// `Dst_DECL` will become part of the function signature and `Dst_REF` needs to
// be able to evaluate to an lvalue of `dasm_State *`, since dynasm function may
// assign to it.
//
// 1) Our dynasm state will be local to only one function, so a local variable
// holding the `dasm_State *` is sufficient there.
//
// 2) Because the dynasm functions need lvalue of `dasm_State *` we will need a
// level of indirection. For convenience we will have another local variable
// which will hold `dasm_State **`, i.e. a double pointer, whose dereference
// evaluates to an lvalue of `dasm_State *`.
//
// 3) Since dasm functions also use `Dst` internally, `Dst`, `Dst_DECL` and
// `Dst_REF` will necessarily have to use the same variable name. We pick the
// name `ds` to snad for "DynASM state".
//
// 4) To ensure consistent use, all our functions needing `dasm_State` will also
// use `Dst_DECL` in their signature and `Dst` to refer to the state, i.e. for
// example to call other dynasm functions which expect the same parameter. The
// canonical DynASM functions will then be able to extract `dasm_State *` and
// operate on that structure.
//
// This seemingly complicated scheme introduced by DynASM has a nice property -
// the state passed to DynASM functions is controlled by us. Other macros allow
// more customization of e.g. allocation in DynASM and by providing our state,
// we can customize how DynASM allocates memory. Also in real system, we would
// have more state than `dasm_State *` anyways, so our use could look like this
// instead:
//
//         typedef struct {
//                 Heap heap;
//                 Program program;
//                 dasm_State *ds;
//         } State;
//
//         #define Dst       state
//         #define Dst_DECL  State *state
//         #define Dst_REF   (sate->ds)
//
#define Dst       ds
#define Dst_DECL  dasm_State **ds
#define Dst_REF   (*ds)

// Also last chance before we include the DynASM headers is to optionally enable
// some checks from DynASM. We'll have them enabled unconditionally, in practice
// we wouldn't have them enabled in a (sufficiently tested) release build.
#define DASM_CHECKS

// Now we can finally include the DynASM headers. The first one contains just
// the declarations of functions we can use or fallback definitions of macros,
// some of which we chose to override above. This header file is generic and
// applies to all dynasm backends regardless of architecture.
#include "dasm_proto.h"

// The second header file we include is specific for our architecture. Since we
// aim the x86-64 we need to include that. While the preprocessor has a separate
// file for the 64 bit version of x86, there is only one header common to both.
#include "dasm_x86.h"

// The `.arch` line tells the dynasm.lua prepreprocessor to load dasm_x64.lua,
// which knows the details of the x64 (x86-64) architecture. For this reason it
// has has to one of the first thing the preprocessor sees. But, this directive
// will also translate directly to some output: a check that the version of the
// preprocessor (dynasm.lua) is consistent with the header file (dasm_proto.h).
// So it can't be before the includes above.
//|.arch x64

// The above directive translates to something like this:
//
//         #if DASM_VERSION != 10500
//         #error "Version mismatch between DynASM and included encoding engine"
//         #endif

// Dynasm allows us to write snippets of assembly after `|` ("pipe"), or in our
// version after `//|`. These snippets are preprocessed into calls to the
// `dasm_put` function which appends the code to an internal dasm buffer, which
// after several passess (done by functions which will be introduced in due
// time), we can extract and have an executable series of bytes, i.e. code
// generated at runtime. The biggest devises of dynasm are:
//
//  1) We write the assembly snippets directly into our C source, which makes it
//     quite readable.
//
//  2) We can mix the assembly with regular C expressions. These expressions are
//     written statically in the source (i.e. `4 * offset`), but their values
//     can be different at different times `dynasm_put` is called (that is, when
//     the C execution gets to our code between `//|`). This is nice when we
//     have things with static _shapes_, but that can take advantage of having
//     different constants being embedded in the assembly. For an example a
//     bytecode `OP_GET_LOCAL index` instruction that sets the local at index
//     `index` to the value at the top of the stack, probably has an index
//     following it in the instruction stream. From the perspective of the
//     interpreter, the index has to be loaded everytime any `OP_GET_LOCAL`
//     instruction is executed, since different `GET_LOCAL` instructions can
//     have different indices. But if we were to translate that instruction into
//     assembly, we are always translating a concrete `GET_LOCAL` with a
//     concrete index, so what we want to generate generally will be the same
//     code, but with a different constant.
//
//     Suppose that locals are 8 bytes and that base pointer to locals is in
//     the register rbx, and that we want to push the value to the stack:
//
//   if (op == OP_GET_LOCAL) {
//           int index = READ_INDEX(ip + 1);
//           | mov rax, [rbx + (index * 8)]
//           | push rax
//   }
//
//   // Here `(index * 8)` is a regular C expression which will become the
//   argument to `dasm_put`, i.e. `dasm_put(..., (index * 8))`, other arguments
//   to `dasm_put` need to somehow encode the rest of the snippet, i.e. the
//   instruction bytes:
//
//           mov REG, [REG + CONST]
//           push REG
//
//
//
// `.actionlist` results in static array of unsigned chars (i.e. bytes), which
// constitute a bytecode for the runtime portion of dynasm which performs the
// puts, links and encodes at runtime. Different `dasm_put` calls use different
// portions of this buffer. There can be only one `.actionlist` per file.
//
// The bytecode contains bytes that are translated literally into the output,
// these are bytes that directly correspond to encodings of instructions.
//
// The "encoding time constants", i.e. the C expressions are not put into the
// action list. First of all, these are C expressions, that dynasm doesn't know
// much about (it is a simple text based preprocessor), but also naturally these
// expressions evaluate to different values at runtime. Instead, the bytecode
// contains also dynasm instructions, which tell dynasm that for example, now it
// should take the next expression (passed to `dasm_put` as argument) or that it
// is now encoding a forward jump with a 4 byte offset, which should be reserved
// in the first pass, but resolved in a subsequent pass.
//
//|.actionlist our_dasm_actions
//
// The above directive translates to something like this:
//
//
//         static const unsigned char our_dasm_actions[85] = {
//                 83,85,72,137,229,72,187,237,237,255,249,204,255,104,237,255,
//                 89,88,72,1,200, 80,255,72,184,237,237,72,191,237,237,94,252,
//                 ...
//         };
//
//  Generally most bytes correspond to instruction bytes directly, while the
//  high bytes (233+ at the moment) are DASM instructions.
//

// Labels are a well known to anybody who has seen any assembly code. They allow
// us to refer to positions in the code (or data, ...) by human readable names,
// instead of the relative offsets or absolute positions the machine code needs.
// DynASM allows us to have a fixed set of global labels, and give us an array
// of pointers (`void *`) to them. So not only can we refer to these labels in
// DynASM snippets and DynASM will translate the use of those labels to
// probably relative offsets of jumps / calls, but we will also have absolute
// addresses of these labels and can e.g. call them directly, by casting these
// pointers to function pointers and calling them.
//
// For example we can define an "identity" function that just returns its first
// (and only argument). According to the AMD 64 System V ABI, the first
// arguments is passed in register rdi and the return value is supposed to be in
// the register rax, so our function marked with the global "MY_IDENTITY" label
// looks like this:
//
//         |->MY_IDENTITY:
//         |
//         |  mov rax, rdi
//         |  ret
//
// We can refer this function from any other snippet and DynASM will translate
// the relative call offset as appropriate:
//
//         | mov rdi, 5
//         | call ->MY_IDENTITY
//
// The situation with offsets is not that easy though (they are only 32 bit,
// even on x86-64 where the address space is 64 bit), but we'll come back to
// that later.
//
// Or after processing it, we can access take the address of the function
// (`void *`), cast it appropriately and call it. The addresses of all globals
// are stored in a single globals array, which we need to allocate ourselves.
// The global functions are referred to by the means of their symbolic names
// through a C enum, with a prefix. We set this prefix by use of the `.globals`
// dynasm preprocessor directive, which will translate to the enum with all
// global label names (with the prefix).
//
//         void *our_dasm_labels[DASM_LBL__MAX];
//         dasm_setupglobal(Dst, our_dasm_labels, DASM_LBL__MAX);
//         // processing the snippet with the global label, as well as linking
//         // and encoding of the code are omitted, but after that we could do:
//         long (*identity)(long argument) = (long (*)(long)) dasm_labels[DASM_LBL_MY_IDENTITY];
//         long five = identity(5);
//
// Above, `identity` is a pointer to function taking one argument of the type
// `long` and returning a `long`. This is the type of the identity function we
// defined above (since `long` is 64 bit on x86-64). We cast the `void *` from
// the globals array and store it in `identity` so that we can then call it.
//
// (In standard C casting between data pointers and function pointers is not
// allowed, and indeed it doesn't work on some architectures, but we are fine
// on x86-64.)
//
// Here we tell the dynasm preprocessor what prefix we want to use for the enum,
// and it will also translate to that enum:
//
//|.globals DASM_LBL_
//
// The above translates to something like:
//         enum {
//                 DASM_LBL_MY_IDENTITY,
//                 DASM_LBL__MAX
//         };

// DynASM support multiple sectinons, though they are all currently limited to
// executable code, and "data" or "bss" sections are not supported. We will do
// with just one section for "code" we declare it here, which causes the
// preprocessor to define a couple of section related macros -- one for each
// section to give it an index, and one for the total number of sections that
// will came handy later, when we call `dasm_init` which needs to know the
// maximal number of sections we want to use.
//|.section code
//
// The above translates to something like:
//
//         #define DASM_SECTION_CODE   0
//         #define DASM_MAXSECTION     1

// In these days of virtual memory, most of the memory we can allocate won't be
// executable. So we need to allocate our own pages of memory, where we will put
// our code and make them executable. However, security-wise, having memory
// which is both writable and executable is far from ideal, so it is better to
// allocate writable pages, put our code into them and then make them
// executable, but not writable. That way the pages are never both writable and
// executable at the same time (this is known as W^X and even strictly enforced
// by some operating systems).
//
// Apart from the checks the below function is mostly a copy of the one from
// Peter Cawley's DynASM tutorial, we just do more checking with asserts and try
// to support Macs.
//
// What the function does, is that it receives a dynasm state, with some
// snippets already put into it with `dasm_put` which is essentially a first
// pass over the code, where we are just pasting together snippets and DynASM is
// interpolating them with the evaluations of the used expressions. `dasm_link`
// is the second pass we need to do on the pending code. The call to the
// function essentially tells dynasm that here our assembly pasting ends. It
// will do a pass over the code determining the relative offsets of labels, etc.
// and most importantly it will already figure out the final length of the
// code and return it to use through an output parameter. We use that size to
// allocate enough writable pages from the operating system and tell DynASM to
// encode the code there. This will not only allow us to mark those pages
// executable and have the code in an executable area, but also allows DynASM to
// know the final destination of the code, so it can finalize relative offsets
// to e.g. global labels, which (in general) are not in the pending piece of
// code. After we have the code in our buffer, we ask the operating to change
// the protection of the pages there to make the area non-writable, but
// executable. Finally we return a pointer (`void *`) to the start of the
// buffer. This leaves our function generic and we don't have to limit our
// linking and encoding to a single function signature -- the caller can cast in
// any way they like.
static void *
our_dasm_link_and_encode(Dst_DECL)
{
	size_t size;
	void* code;
	assert(dasm_checkstep(Dst, 0) == DASM_S_OK);
	assert(dasm_link(Dst, &size) == DASM_S_OK);
#ifdef _WIN32
	code = VirtualAlloc(0, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
#else
	code = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT_VALUE, -1, 0);
#endif
	assert(dasm_encode(Dst, code) == DASM_S_OK);
#ifdef _WIN32
	DWORD original;
	VirtualProtect(code, size, PAGE_EXECUTE_READ, &original);
#else
	mprotect(code, size, PROT_READ | PROT_EXEC);
#endif
#if defined(__APPLE__) && defined(__MACH__)
	sys_icache_invalidate(code, size);
#endif
	return code;
}


// All instructions implicitly bump the instruction pointer by their length
// after they are executed, unless stated otherwise.
enum op {
	// Take 4 bytes from the instruction stream, interpret them as little
	// endian two's complement signed integer and push them on to the stack.
	OP_CONSTANT,

	// Pop `b` and `a` respectively from the stack, push `a + b` on to the
	// stack.
	OP_ADD,

	// Pop a value from top of the stack and print it.
	OP_PRINT,

	// Take a value from the input stream and advance the input position.
	OP_INPUT,

	// Pop a value from top of the stack and don't use it for anything.
	OP_DISCARD,

	// Take 4 bytes from the instruction stream, interpret them as little
	// endian two's complement signed integer to be used as an offset
	// relative to the top of the stack (i.e. offset 0 corresponds to the
	// 32 bit value on top of the stack, offset 1 corresponds to the 32 bit
	// value 1 below the top of the stack). Push the value found on that
	// offset (relative to the top of the stack) on top of the stack.
	OP_GET,

	// Take 4 bytes from the instruction stream, interpret them as little
	// endian two's complement signed integer to be used as an offset
	// relative to the top of the stack (i.e. offset 0 corresponds to the
	// 32 bit value on top of the stack, offset 1 corresponds to the 32 bit
	// value 1 below the top of the stack). Pop a value from top of the
	// stack. Assign the value to the slot at the offset (relative to the
	// top of the stack).
	OP_SET,

	// Pop `b` and `a` respectively from top of the stack. Compare them and
	// push a positive value to the top of the stack if `a` is bigger then
	// `b`, negative value if `b` is bigger than `a` and 0 if `a` is equal
	// to `b`.
	OP_CMP,

	// Take 4 bytes from the instruction stream, interpret them as little
	// endian two's complement signed integer to be used as an offset in the
	// instruction stream. Pop a value from top of the stack. If the value
	// is positive, then apply the offset (in bytes) to the current
	// instruction pointer. Note that the offset is relative to the start
	// of the `OP_JGT` instruction, i.e. the instruction pointer is only
	// bumped by the length of the instruction in case the conditional
	// jump isn't executed.
	OP_JGT,

	// Halt the execution of the program.
	OP_HALT,
};

static void *
compile(u8 *program, size_t program_len)
{
	// Here is the promised local variable holding the `dasm_State *`
	// itself. Though as defined with the macros above, we and other
	// functions will generally use it with the name `ds` and expect a
	// double pointer.
	dasm_State *dasm_state;
	dasm_State **ds = &dasm_state;

	// Each state has to be initialized with a call to `dasm_init`. As
	// promised, we use the macro `Dst`, instead of referring to `ds`
	// directly. This gives us more consistency with rest of the functions
	// and calls and is flexible. In any case the name `Dst` is referred to
	// by the `dasm_put` calls generated by the preprocessor. The second
	// argument tells DynASM the number of sections we will be using. We
	// only need 1, the section for all our code, but we also diligently used
	// the `.section` directive above and told DynASM what names we wanted
	// to give to our sections (we chose "code" for our single one) and thus
	// now have available the macro DASM_MAXSECTION generated by the
	// preprocessor.
	dasm_init(Dst, DASM_MAXSECTION);

	// We are not using global labels and in a simple template JIT likely
	// won't find the need, but we sill need to setup the globals, because
	// local labels can only be used when global labels are setup. And local
	// labels come handy pretty quickly (our snippets may need to contain
	// loops for example).
	void *our_dasm_labels[DASM_LBL__MAX];
	dasm_setupglobal(Dst, our_dasm_labels, DASM_LBL__MAX);

	// Now that we have our dynasm state initialized, we potentially want to
	// reuse it to assemble multiple pastings of templates and not just one.
	// Calling `dasm_init` and `dasm_free` everytime is an option, but we
	// can just reuse the same state. We just have to initialize each
	// "trace" with a call to `dasm_setup`, which we have to do even if we
	// want to process a single "trace" like we are doing in this example.
	// So here it is. We have to provide the byte array prepared by the
	// preprocessor. There can only be one `.actions` directive per file,
	// which means that all `dasm_put` calls in one file are based on that
	// single actions array (called `our_dasm_actions` in this case), so we
	// really don't want to pass anything other here. (One could
	// probably have different files with different `.actions`, but
	// "templates" from different files couldn't be mixed since DASM stores
	// the current actions in its internal state and it can't be changed in
	// any other way than `dasm_setup`, which resets the state.)
	dasm_setup(Dst, our_dasm_actions);

	// We want to translate jumps from the bytecode to jumps in the assembly
	// code and want dynasm to figure out the (relative) offsets as needed
	// by the instructions. For example our `OP_JGT` instruction has an
	// immediate operand which is the (byte) offset to apply in case a jump
	// should be taken. The machine code encodes similarly a relative offset
	// for a (conditional) jump, but it will be a different one than the one
	// in the bytecode. Since there is a potentially arbitrary number of
	// instructions in the bytecode we want to compile, we can't possibly
	// use local labels (1:, ..., 9:), because there are only 9 of them and
	// already want to be able to use them for local control flow (in one or
	// possible more templates pasted together) such as conditional
	// execution or loops. Global labels are also not suitable, because
	// these have symbolic names and we would have a hard time encoding
	// things like "instruction 5 needs to jump 8 bytes back, which is a
	// beginning of another instruction".
	//
	// DynASM has us covered and offers "pc labels", also called "dynamic
	// labels", which are just positive integers (internally used as
	// indices). These labels are dynamic, because the integers are
	// determined at the time of `dasm_put`, i.e. they can be arbitrary C
	// expressions that can evaluate to different values at different times
	// the snippet is pasted (also called "encoding-time constants"). So
	// again same shape (snippet), but with different values. We will be
	// using these labels in a really straightforward way - we'll want to
	// have a dynamic label for each instruction. When we encounter a jump
	// instruction, we can just point the jump to the dynamic label of the
	// destination. This works fine for backward as well as forward jumps,
	// as with all labels, DynASM is able to resolve relative offsets,
	// because it does multiple passes on the code. This is even simpler
	// than our compiler, which had to "fixup" forward jumps!
	//
	// Because the dynamic labels are indices to a DynASM internal array, we
	// need to "preallocate" the ones we'll need, and if we ever need more,
	// we'll have to allocate more. This is done with call to `dasm_growpc`.
	// After call to `dasm_growpc(Dst, N)`, dynamic labels 0 through at
	// least N - 1 will be available to us.
	//
	// Our bytecode is a compact serialization - instructions have different
	// lengths and jumps are based on byte offsets. We will go for a simple
	// route and instead of somehow numbering the instructions from 0 to
	// the number of instructions (and then figuring out the forward jump
	// destinations in a second pass), we'll just have a dynamic label for
	// each _byte_. Then even for forward jumps we are able to calculate
	// their byte offset in the instruction stream, where we will be put an
	// appropriate dynamic label later, and DynASM will resolve it in its
	// second pass. Having a label for each _byte_ is potentially
	// really wasteful as a lot of instructions have lengthy immediates, but
	// it's really simple. (The dynamic labels and the backing array are
	// also reused for multiple runs, if we don't `dasm_free` immediately,
	// but just `dasm_setup` before each run).
	dasm_growpc(Dst, program_len);

	// Now we have a fully initialized DynASM state for this round of
	// pasting together some assembly snippets. Remember that the lines with
	// assembly preceded with `//|` will be translated to calls to
	// `dasm_put(...)`.  So indeed what we are just doing is by controlling
	// the C execution we determine which assembly snippets we want to paste
	// together. With DynASM we can form arbitrary pieces of code, but we
	// somehow need to execute the code. We of course know how to execute
	// some other code in assembly - jump to it or call it. Both essentially
	// do a similar thing - through a relative offset or absolute address
	// stored in a register they change the instruction pointer (and call
	// additionally pushes to the stack the previous instruction pointer
	// which pointed to the _next_ instruction, so after we return from the
	// call, we can just resume the execution).

	// But we want to also somehow jump to our code from C. Unlike the
	// popular belief, the semantics of "goto statements" in C are not such
	// low level as a jumps in assembly, so we'll have to do with function
	// calls. All functions in C respect something that is called the ABI
	// of the platform, the "Application Binary Interface". The set of rules
	// for low level code, so that e.g. functions compiled with one compiler
	// can call functions compiled with another compiler. The ABI is usually
	// specific to both the current processor architecture (e.g. x86-64),
	// because the instruction set is really constraining us in what our
	// "binaries" may look like, as well as the operating system (e.g.
	// Linux), because not only we somehow need to communicate with the
	// operating system, but they historically evolved differently and e.g.
	// have different needs, so the ABI differs. Importantly for us, the ABI
	// defines sizes of C integer types (e.g. `long` is 64-bit on x86-64
	// Linux, but 32-bit on x86-64 Windows) as well as how parameters and
	// return values are passed. For simplicity we will care only about
	// the System V x86-64 ABI, used notably by Linux and other operating
	// systems.
	//
	//     https://gitlab.com/x86-psABIs/x86-64-ABI
	//     https://gitlab.com/x86-psABIs/x86-64-ABI/-/jobs/artifacts/master/raw/x86-64-ABI/abi.pdf?job=build
	//
	// Notably, Windows has a custom x86-64 ABI:
	//
	//     https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
	//
	// See also a blog post by Eli Bendersky:
	//
	//     https://eli.thegreenplace.net/2011/09/06/stack-frame-layout-on-x86-64

	// Here is the System V ABI 101 (imprecise, but enough to get going):
	//
	// First 6 arguments are passed in registers (left-to-right):
	//
	//     1. rdi
	//     2. rsi
	//     3. rdx
	//     4. rcx
	//     5. r8
	//     6. r9
	//
	// The return value is expected to be in the register rax. If there are
	// more than 6 arguments they pushed on to the stack right-to-left. Even
	// arguments that are smaller than 64 bits use the full registers to be
	// passed. Small-ish structs are passed as if all the struct members
	// were passed as arguments separately.
	//
	// These registers are _callee_ saved (i.!e. the function that called us
	// expects the values in these registers to be preserved, so we, the
	// callee, need to preserve the registers by not using them or restoring
	// them to their original values):
	//
	//     rbp ("base pointer")
	//     rsp ("stack pointer")
	//     rbx
	//     r12
	//     r13
	//     r14
	//     r15
	//
	// These registers are _caller_ saved (i.e. we as the callee can use
	// them freely, since if the caller needs the their values they need to
	// save the values anyway, on the other hand if we call some other
	// function, we are the caller and the function can possibly change any
	// of the registers, so we have to save the values if we need them):
	//
	//     rdi
	//     rsi
	//     rdx
	//     rcx
	//     r8
	//     r9
	//
	//     rax
	//     r10
	//     r11
	//
	// Notably all registers used for parameter passing are caller saved.
	// Obviously the return value register is also caller saved, but note
	// that the callee is free to change it also in the case there is no
	// return value ("the functions returns void").

	// Now is a good time to decide on how and where to store the data we
	// need. We won't need the program at runtime, since it will be
	// hardcoded into the code (including the immediates, etc.), but we will
	// need the stack (since we will just translate the bytecode for stack
	// machine into assembly, not change its workings substantially) and
	// also a pointer to the current input position.
	//
	// The pointer to inputs will be passed to us as an argument - this is
	// the runtime data that our program will process. The `OP_INPUT`
	// instructions take one 32 bit number from the input and advance the
	// input pointer (by 4 bytes). We assume that the caller provided
	// sufficient number of inputs for the program at hand, so we don't
	// necessarily need the length of the input nor check it at runtime.
	// Since it is our only argument it will be in rdi, but that is a caller
	// saved register and when we later execute `OP_PRINT`, we are the
	// caller, so we need to save the register. We could either `push` and
	// `pop` rdi around each call, or move the argument to a callee saved
	// register, which we can save once in the entry to the function and
	// restore once at the end of the function. This makes for a much
	// simpler book keeping, though as always, if we save multiple callee
	// saved registers with `push`, we have to restore them with `pop` in
	// the _reverse_ order.
	//
	// For stack, we'll just use the x86-64 (call) stack. `OP_PUSH` will
	// translate directly to `push` and `OP_POP` directly to `pop`. These
	// instructions change the value of `rsp`, which though a "general
	// purpose register" is usually used as the stack pointer exactly
	// because some instructions can manipulate it compactly (`call` is
	// another example of an instruction that implicitly changes the stack
	// pointer by pushing the return address). So for us `rsp` will
	// represent pointer to the top of the stack, and we will have to
	// embrace its constraints. In particular `rsp` points to the position
	// where the current top of the stack _is_, not one past it like other
	// schemes do, and the stack grows downwards. So top of the stack is at
	// offset zero from top of the stack, and if we want to push a value to
	// the stack, we first need to decrement the stack pointer and then put
	// the value at the stack pointer position, while to pop we need to
	// read the value from the current stack pointer and then increment it.
	// I.e. pre-decrement push and post-increment pop (we would get another
	// combination if the stack pointer pointed one past the top of the
	// stack, or the stack grew upwards). While we have to keep that in
	// mind, this is what the `push` / `pop` instructions will do for us,
	// and we won't have to do it manually). See also another blog post by
	// Eli Bendersky, which has illustrations:
	//
	//     https://eli.thegreenplace.net/2011/02/04/where-the-top-of-the-stack-is-on-x86/
	//
	// (Note that the blog post concerns x86 whose stack operations are 32
	// bit, while on x86-64 stack operations are 64 bit, e.g. push
	// instructions decrement the stack by 8).

	// Here comes the entry to our function, the "prologue". Here we usually
	// want to save all the callee saved registers that we intend to use.
	// Usually prologue starts with "push rbp" and "mov rbp, rsp", which
	// establish the "stack frame" (also called "call frame"), see the
	// linked blog posts by Eli Bendersky for details. Instead we start by
	// pushing rbx. We will use this callee save register to store the input
	// pointer -- while the caller passes us the input pointer as the first
	// argument in rdi, it would get clobbered once we call other function
	// ourselves. Storing it in a callee saved register and
	// storing/restoring the register on entry/exit keeps the management
	// simple. Callee The thing is that the execution of the program may
	// leave some values on the stack. By resetting the rbp we could get
	// easily rid of them, though it would need for us to address
	// relatively to the rsp to restore rbx if it were pushed after rbp.
	// So we simply push it before (and this restore it with pop after we
	// restore the base pointer).

	//| push rbx
	//| push rbp
	//| mov rbp, rsp
	//| mov rbx, rdi

	// Now we will go through all instructions and translate them one by
	// one. For each instruction we have a snippet of assembly, that will be
	// interpolated with value computed from embedded C expressions
	// ("encoding-time constants").
	u8 *instrptr = program;
	u8 *end = program + program_len;

	// This macro is a helper, which reads a 4 byte immediate operand from
	// a position one byte behind the instruction pointer (which skips the
	// opcode).
	#define OPERAND() ((i32) ( \
		((u32)instrptr[1] << 0) | \
		((u32)instrptr[2] << 8) | \
		((u32)instrptr[3] << 16) | \
		((u32)instrptr[4] << 24)))

	while (instrptr < end) {

		// Read the current opcode, which distinguishes the current
		// instruction.
		enum op op = (enum op)*instrptr;

		// Put here a label corresponding to this instructions byte
		// offset. Any preceding or following jumps to this instruction
		// will find this label. Note that we cast the offset to an
		// `int`. This is because DynASM expects all arguments to
		// `dasm_put` to `int`s. As mentioned above, in this ABI `int`
		// is 32 bit, so all values passed to DynASM are limited to the
		// 32 bit range. This is mostly OK, since in the x86-64 most
		// immediates encoded in the instruction stream are also limited
		// to 4 bytes. This 32 bit limitation is thus a limitation of
		// our target architecture, not necessarily of DynASM, and we
		// have to somehow deal with that.
		//
		// Note that in this case, it's unlikely that we would have
		// programs larger than 4 gigabytes, i.e. offset would not fit
		// into 32 bits. But even shorter programs could produce x86-64
		// executable code larger than 4 gigabytes, and there 32 bit
		// relative offsets for jumps and calls would also not be
		// enough.
		//
		// Also at the start of the instruction, there is a (commented
		// out) `int3` instruction. Comment with `//!` (i.e. two forward
		// slashes and exclamation mark) is not a special syntax
		// understood by DynASM, just something different than `//|`,
		// but still a C comment. The `int3` instruction is a "trap to
		// the debugger". So if you were to uncomment it and run this
		// program in debugger, the debugger would stop before each
		// executed (jitted) bytecode instruction. This can be very
		// useful for debugging, also if paired with a print of some
		// debug information before/after each trap.

		int offset = (int) (instrptr - program);
		//|=> offset:
		//! int3

		// The above  will be translated to roughly:
		//
		//         dasm_put(..., offset);
		//
		// We could have also said:
		//
		//         |=> (int) (instrptr - program)
		//
		// Which would translate to:
		//
		//         dasm_put(..., (int) (instrptr - program));
		//
		// There is nothing much more magic about the embedded C
		// expressions - DynASM handles them textually, and they are
		// evaluated at runtime like ordinary C expressions and passed
		// as arguments to `dasm_put`. Note that officially the
		// evaluation order of arguments is undefined in C, so it might
		// a good idea to explicitly evaluate the expressions before
		// using them in assembly snippets, if they can have side
		// effects, and even if they don't.

		switch (op) {
		case OP_CONSTANT: {
			// If the next instruction is `OP_CONSTANT` we need to
			// append the code for it, we do it by having an
			// assembly snippet here, which translates to
			// `dasm_put`. The instruction has an immediate operand
			// which we read with a macro and store it in a variable
			// before passing it to the snippet. While the
			// `OP_CONSTANT` instruction is generic and an
			// interpreter has to load the constant from bytecode
			// everytime, this particular `OP_CONSTANT` will always
			// have this one same operand, so we can just
			// embed it in the code. The below translates to `push 4`
			// if the operand in the instruction stream is
			// the number 4. This will be pattern matched by DynASM
			// to be a push of 32 bit immediate (see the unofficial
			// DynASM instruction reference or other x86-64
			// instruction references). This is perfect for us,
			// since we want to push a 32 bit value. If we wanted
			// to push 64 bit value, we would first have to somehow
			// get the value to a register (see later) and then use
			// `push REG`, because there just isn't an instruction
			// that is able to push 64 bit immediates. DynASM won't
			// do that for us automatically. What's worse the
			// argument (passed as vararg) is expected to be an int,
			// so passing larger values will only result in silent
			// truncation. Beware!

			i32 operand = OPERAND();
			//| push operand

			// This instruction is 5 bytes long, we have to advance
			// the instruction pointer by 5 bytes and switch on the
			// opcode to decide on what to compile next.

			instrptr += 5; break;
		}
		case OP_ADD: {
			// Pop the second argument, pop the first argument, add
			// the second to the first and push their result. The
			// operations here are 64 bit. Since stacks work with 64
			// bit values, the `pop`s and `push`es have to be 64 bit,
			// but the arithmetic doesn't -- we could just as well
			// have written `add eax, ecx` to operate on 32 bit
			// parts of the registers (which will zero out the
			// upper 32 bits). It would have save us a byte in the
			// encoding of the instructions and be slightly faster
			// in runtime (adding 32 bits is faster than adding 64),
			// but we don't care about such small improvements.

			//| pop rcx
			//| pop rax
			//| add rax, rcx
			//| push rax

			instrptr += 1; break;
		}
		case OP_PRINT: {
			// Printing is a little tricky. Barring system calls, we
			// have to call an external function to do the print.
			// This could be either our own function, or
			// even one from external library, like `printf`.
			// There is one problem though normal calls are based on
			// signed 32 bit relative offsets, while the x86-64
			// space is 64 bit, so call targets can be much further
			// than +-2 GiB. Here we find ourselves at runtime,
			// where both the static and dynamic loader/linker
			// have already figured out where everything in the
			// memory is, after all, our program is already
			// running. So for example the (64 bit) address of the
			// `printf` function is already known. But since we have
			// not yet mmaped pages for our code, we don't know
			// whether it will be close to `printf`!. DynASM has the
			// capability to handle external names through a
			// callback which will let us handle that in the
			// encoding stage where we already know the
			// destination of our code, but let's not go that far
			// right now. What we can do is use an _indirect_
			// through a register - we will store the 64 bit address
			// of `printf` in a register, and then call it
			// with `call REG`. Most instructions don't allow 64 bit
			// immediate operands, but one does, DynASM calls it
			// fittingly `mov64`, but other assemblers call it
			// `movabs`. It can store a 64 bit immediate in a
			// register. Perfect for us. Though there is another
			// problem, since the parameters to `dasm_put` are all
			// 32 bit integers (`int`), how are we going to pass it
			// to DynASM? Well, DynASM preprocessor knows that the
			// instruction operand is special and handles it by
			// passing the lower 32 bits and upper 32 bits
			// separately. Since the argument to `dasm_put` is
			// determined by a simple text substitution, and here
			// the argument will be evaluated twice, we should be
			// even more careful about complex expressions and side
			// effects.
			//
			// To printf, we also have to pass the format string as
			// a second argument, we also take its address and move
			// it into a register as 64 bit immediate. The second
			// argument should be the number to print -- we pop that
			// from the stack into a register. Finally we call the
			// function with an indirect call instruction.
			// Note that we cast the addresses to integers to avoid
			// compiler warnings (because dynasm follows it with
			// truncation to 32 bits or extraction of upper 32 bits
			// respectively.
			//
			// It is important to note, that printf respects the
			// ABI, so we have to presume it destroys the
			// values in caller saved registers, we only store our
			// state in rbx, so we our fine.

			//| mov64 rax, ((uintptr_t) printf)
			//| mov64 rdi, ((uintptr_t) "%zd\n")
			//| pop rsi
			//| call rax

			// The above is translated to roughly the following:
			//
			//     dasm_put(...,
			//         (unsigned int)(((uintptr_t) printf)),
			//         (unsigned int)((((uintptr_t) printf))>>32),
			//         (unsigned int)(((uintptr_t) "%zd\n")),
			//         (unsigned int)((((uintptr_t) "%zd\n"))>>32)
			//     )

			instrptr += 1; break;
		}
		case OP_INPUT: {
			// We have to read 32 bits from the place where rbx (the
			// input pointer) points to and advance it by
			// those 32 bits so next time we read the next input
			// value.
			//
			// For reading a 32 bit value (aka dword) from memory,
			// we could just issue a read into some 32 bit
			// register, like `eax`, i.e. `mov eax, [rbx]`, which
			// says "read a 32 bit value from a 64 bit address
			// stored in the rbx register. The size of the read is
			// determined implicitly from the (in this case
			// destination) register size. In cases involving
			// immediates, we usually have to specify whether we are
			// storing an 8/16/32/64 bit immediate by including the
			// right keyword for the size (`byte`, `word`, `dword`,
			// `qword` respectively). But we can also include these
			// keywords when they are not necessary, if we prefer
			// that, as we do below by specifying "dword read", even
			// though it would have been implicit from the use of
			// `eax` as the destination.

			//| mov eax, dword [rbx]
			//| push rax
			//| add rbx, 4

			instrptr += 1; break;
		}
		case OP_DISCARD: {
			// We just `pop` into a register and don't use the
			// value. That is what `OP_DISCARD` really is. Okay,
			// since we don't do anything interesting with the
			// value, we could just increment the stack pointer with
			// `add rsp, 8`, which would have saved a load to `rax`,
			// but since we don't care about preserving `rax`, and
			// use `push` and `pop` instructions in other places, so
			// we do it for consistency.

			//| pop rax

			instrptr += 1; break;
		}
		case OP_GET: {
			// For `OP_GET` we have a 32 bit immediate operand which
			// we will have to use as an offset to the top of the
			// stack. The offset is in number of elements, and in
			// our case each elements is 8 bytes. Since we load the
			// both the constant 8 and the operand as well as the
			// result of their multiplication are runtime known
			// values, but constants from the point of view of the
			// instruction - the multiplication is performed now and
			// the instruction will only contain the offset
			// as an immediate, for example if operand is 5, the
			// instruction will have the immediate value 40 encoded
			// in it. The way we do that below is slightly risky, we
			// put the multiplication and read of the operand in the
			// assembly snippet. The syntax may seem similar to the
			// well known complex addressing modes of x86-64, where
			// for example `[rsp + 8 * rax]` would mean of the
			// possible addressings. The trick here is that DynASM
			// expects to see `[rsp + rax * 8]`, so it isn't
			// confused by the leading `8 *`. Since `8 * OPERAND()`
			// is an encoding time constant, an entirely different
			// addressing mode will be used `[REG + IMMEDIATE]`.
			// Evaluating the "constant" before the assembly snippet
			// and storing it in a variable would have probably made
			// it much clearer.

			//| mov rax, [rsp + 8 * OPERAND()]
			//| push rax

			// The above translates roughly to:
			//
			//     dasm_put(..., 8 * OPERAND());

			instrptr += 5; break;
		}
		case OP_SET: {
			// There isn't anything interesting to `OP_SET`, we
			// pop a value from top of the stack and then store it
			// to an offset from (the new) top of the stack.

			//| pop rax
			//| mov [rsp + 8 * OPERAND()], rax

			instrptr += 5; break;
		}
		case OP_CMP: {
			// Here we need to compare two numbers and push a
			// positive number / negative number / zero as
			// appropriate. Let's not use setcc, and instead
			// showcase some local control flow using local labels.
			// Remember, the "arrows" (`<`, `>`) of local labels
			// point in the direction where the label is defined,
			// here we refer to forward labels only. The labels are
			// also purely based on the order of the instructions,
			// i.e. the order dasm receives them in, lexical order
			// in the C source doesn't matter at all.

			//| pop rcx
			//| pop rax
			//| cmp rax, rcx
			//| jg >1
			//| je >2
			//| push -1
			//| jmp >3
			//|1:
			//| push 1
			//| jmp >3
			//|2:
			//| push 0
			//|3:
			instrptr += 1; break;
		}
		case OP_JGT:
			// Conditional branch based on whether the top of the
			// stack is a positive number. We set the flags based on
			// the top of the stack by popping it into a register
			// and then testing the register with itself. This
			// effectively compares the register to zero, so we can
			// use the "greater than" conditional jump. The relative
			// byte offset to the destination is encoded in
			// the instruction stream, so we figure out the
			// (absolute) byte offset from the beginning, which also
			// is the index of dynamic label we either already put
			// there, or will later.

			int offset = (int) (instrptr - program + OPERAND());
			//| pop rax
			//| test rax, rax
			//| jg => offset

			instrptr += 5; break;
		case OP_HALT:
			// When the code compiled by us encounters this
			// instruction, it should halt the execution, since we
			// structure the code as a function, here is the right
			// place for return -- an epilogue including a `ret`
			// instruction.
			//
			// In our scheme we decided to first save rbx and then
			// rbp/rsp, so here we do the reverse - restore the base
			// and stack pointers and then restore rbx to the
			// caller's value.

			//| mov rsp, rbp
			//| pop rbp
			//| pop rbx
			//| ret

			instrptr += 1; break;
		}
	}

	// We `dasm_put` all snippets. Now we need to link and encode them. See
	// the description of the function for more details.
	void *code = our_dasm_link_and_encode(Dst);

	// Here we could keep the same DASM state, call `dasm_setup` and
	// continue with compilation of other programs, we just needed this
	// single one. (Also in the case we wanted to compile more programs, we
	// would probably split the dasm state initialization and
	// deinitialization into separate functions called just once, which
	// would have made it possible to use the same state with the `compile`
	// function multiple times.
	dasm_free(Dst);
	return code;
}

int
main(int argc, char **argv)
{
	// Here we have a program to multiply two numbers, the entire program,
	// just like the bytecode is due to Martin Dørum, be sure to check his
	// blog post for details:
	//
	//         https://mort.coffee/home/fast-interpreters/

	u8 program[] = {
		OP_INPUT, OP_INPUT,
		OP_CONSTANT, 0, 0, 0, 0,

		OP_GET, 0, 0, 0, 0,
		OP_GET, 3, 0, 0, 0,
		OP_ADD,
		OP_SET, 0, 0, 0, 0,

		OP_GET, 1, 0, 0, 0,
		OP_CONSTANT, 0xff, 0xff, 0xff, 0xff, // -1 32-bit little-endian (two's complement)
		OP_ADD,
		OP_SET, 1, 0, 0, 0,

		OP_GET, 1, 0, 0, 0,
		OP_CONSTANT, 0, 0, 0, 0,
		OP_CMP,
		OP_JGT, 0xd5, 0xff, 0xff, 0xff, // -43 in 32-bit little-endian (two's complement)

		OP_GET, 0, 0, 0, 0,
		OP_PRINT,

		OP_HALT,
	};

	// The input for us are just two command line arguments.
	if (argc != 3) {
		fprintf(stderr, "Expected exactly 2 arguments\n");
		return 1;
	}
	i32 input[] = { atoi(argv[1]), atoi(argv[2]) };

	// Compile the program by calling the compile function with the program
	// and its size.
	void (*fun)(i32 *input) = compile(program, sizeof(program));

	// And run the function passing it pointer to the input array. We don't
	// pass the length, the compiled program doesn't need it -- it expects
	// the input array to be of sufficient length.
	fun(input);

	return 0;
}