-rw-r--r--  README.txt              218
-rw-r--r--  misc/leaf-classes.lisp   27
-rw-r--r--  src/Makefile              6
-rw-r--r--  src/gc.h                 14
-rw-r--r--  src/gc/gc-debug.c         0
-rw-r--r--  src/gc/gc.c              99
-rw-r--r--  src/main.c                5
-rw-r--r--  src/platform.h           13
-rw-r--r--  src/platform/3ds.c       33
-rw-r--r--  src/util.c               37
-rw-r--r--  src/value.c             111
-rw-r--r--  src/value.h              12
12 files changed, 508 insertions, 67 deletions
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..0221931
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,218 @@
+╭──────────────────────────────────────────────────────────────────────────────╮
+│ imb3: A programming language that combines the best parts of Common Lisp and │
+│ Forth to make a very high-level language with easy metaprogramming. │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ The "guts" of the language resemble Common Lisp (e.g. a CLOS-like object │
+│ system, a condition system, a high-performance garbage collector), while the │
+│ surface syntax is all Forth. There is also a type checker for a type system │
+│ that allows dynamically-typed code to exist while also allowing fairly │
+│ complex properties to be checked at compile-time if desired. The cherry on │
+│ top is an efficient native compiler with the REPL interface users of both │
+│ Common Lisp and Forth have come to expect. │
+├──────────────────────────────────────────────────────────────────────────────┤
+│ This initial implementation is designed to be recompilable for a variety of │
+│ platforms, including the Nintendo 3DS. │
+╰──────────────────────────────────────────────────────────────────────────────╯
+
+╭──────────────────────────────────────────────────────────────────────────────╮
+│ Garbage Collector │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Currently, the garbage collector is generational, with                       │
+│ three generations -- the “nursery,” the “young-heap,” and the “old-heap.” │
+│ Each one uses a different garbage collection algorithm. │
+│ │
+│ The nursery is the region almost all new objects are allocated into. There │
+│ are two exceptions: │
+│ │
+│ 1. Objects that are too large to fit in the nursery. │
+│ 2. Objects with a finalizer. Note that finalizers are not user-exposed, to │
+│ avoid issues with object resurrection -- they are currently only used │
+│ to allow allocating executable code, using a custom allocator with a │
+│ malloc/free -style interface. │
+│ │
+│ These objects are instead allocated into the young-heap if they could fit │
+│ there, and the old-heap if not. │
+│ │
+│ The nursery’s size is chosen when the process starts, and is equal to the │
+│ size of the largest L1d cache. When the nursery fills up, Cheney’s algorithm │
+│ is run to copy any live objects into the young-heap. Because Cheney’s │
+│ algorithm does not traverse dead objects, objects with finalizers must be │
+│ allocated in (and, more importantly, freed from) another region. │
+│ │
+│ As the data in the nursery grows up, the addresses ┌──────────────┐ 0x…8000 │
+│ of slots that have had pointers written into them │remembered set│ │
+│ are written to the top of its region, and the end ├──────────────┤ 0x…7000 │
+│ of the allocatable region grows down. (This is the │ free │ │
+│ only write barrier the collector needs. Notably, │ space │ │
+│ this avoids branches other than “are we out of ├──────────────┤ 0x…5000 │
+│ space for the remembered set (in which case we need │ │ │
+│ to perform a nursery GC).”) │ │ │
+│ │ objects │ │
+│ When a nursery GC occurs, its remembered set is │ │ │
+│ checked for references into the nursery. These are │ │ │
+│ used as roots. References to other regions are └──────────────┘ 0x…0000 │
+│ copied into the remembered set of the young-heap, The Nursery Region’s │
+│ and non-references are discarded. This discarding Layout │
+│ behavior removes another branch -- whenever a value │
+│ not statically known to be an integer is written, │
+│ we can unconditionally add the place it was written to into the remembered │
+│ set. │
+│ │
+│ The young-heap is a region that is larger than the nursery, but small enough │
+│ that collections should still be fast. It is sized to be a fixed multiple of │
+│ the amount of L3 cache in the system (currently, 2). It is collected with a │
+│ mark-compact algorithm based on the Lisp 2 algorithm, which makes finalizers │
+│ essentially free. To ensure that there is always room to complete a nursery │
+│ GC, we reserve a portion of the young-heap equal in size to the nursery, and │
+│ collect the young-heap once there is not enough free memory to allocate │
+│ without cutting into that reservation. │
+│ ╭──────╥────────────────────────────────╮ │
+│ Objects in the young-heap remain │ TODO ║ Is there a selection algorithm │ │
+│ there for a tunable number of GC ╞══════╝ we could use to make a dynamic │ │
+│ cycles (currently defaulting to 3) │ choice for how long objects remain in │ │
+│ before being tenured -- that is, │ the young-heap before being tenured? │ │
+│ moved to the old-heap. ╰───────────────────────────────────────╯ │
+│ │
+│ The young-heap has its own remembered set, whose entries are copied in from │
+│ the nursery’s remembered set. To ensure that these entries are up to date, │
+│ a young-heap GC can only occur when the nursery is empty. After a young-heap │
+│ GC, we do not need to copy the remembered set into the old-heap, because all │
+│ remaining references will stay internal to the old-heap. │
+│ │
+│ Finally, the old-heap stores objects that have outlived their time in the │
+│ other regions. It is organized into several blocks, each of which belongs to │
+│ a size class. New blocks are allocated and freed as needed, so the old-heap │
+│ dynamically grows and shrinks. Old-heap collections occur when the old-heap │
+│ is using twice as much memory as it was the last time a collection completed │
+│ (capped by a memory limit, by default the size of RAM on the entire system). │
+│ │
+│ The old-heap is collected using mark-sweep with bitmap marking, including a │
+│ per-block bit which allows entire blocks to be swept at once if they contain │
+│ only dead objects without finalizers. (A separate bit is maintained when │
+│ objects are allocated and when sweeping, to track whether a block contains │
+│ objects with finalizers or not.) │
+│ │
+│ As the young-heap requires the nursery to be empty before it is collected, │
+│ so too does the old-heap require both the nursery and young-heap to be empty │
+│ before it can be collected. Because collecting the old-heap tends to be so │
+│ slow relative to the nursery and young-heap, this does not add significant │
+│ overhead. │
+│ │
+│ If 5 consecutive old-heap collections recover less than 2% of heap memory    │
+│ needed while the heap is at its maximum size, the process exits with an │
+│ error. This behavior and these constants are taken from OpenJDK, and are an │
+│ attempt to avoid GC thrashing. │
+├──────────────────────────────────────────────────────────────────────────────┤
+│ Thread-based parallelism is not yet implemented. However, parallelizing this │
+│ collector design is relatively straightforward. Each thread should be pinned │
+│ to a hardware core. Threads can then get a per-thread nursery sized to their │
+│ core’s L1d cache, rather than a system-wide maximum. │
+│ │
+│ The young-heap and old-heap are each protected with a mutex, which prevents │
+│ multiple calls to allocate memory from interfering with one another. │
+│ Unfortunately, this also means that nursery collections cannot occur │
+│ concurrently. It is an open question whether this is an issue in practice as │
+│ mutators and nursery collections execute, but it becomes an issue during │
+│ young-heap collections. │
+│ │
+│ To mitigate this issue, starting a young-heap collection does not signal the │
+│ other threads immediately. Instead, each thread is responsible for notifying │
+│ the next thread just before it completes collection, unless another thread │
+│ is already waiting to collect. This maximizes the amount of time threads │
+│ spend mutating instead of waiting to perform a nursery collection. │
+│ │
+│ Once all the nursery collections have completed in serial, the marking phase │
+│ can be performed in parallel with work-stealing. Of the Lisp 2 algorithm’s │
+│ passes, only the “update object fields to refer to the new location” pass is │
+│ obviously parallelizable. Finalizers are also called in this pass, to allow │
+│ them to be executed in parallel as well. (Recall that finalizers are always │
+│ provided by the runtime, so they can be trusted to be thread-safe.) │
+│ │
+│ Finally, the old-heap can easily be both marked and swept in parallel. Each │
+│ block is conceptually owned by a thread, and that thread is responsible for │
+│ sweeping it. │
+├──────────────────────────────────────────────────────────────────────────────┤
+│ There are some experiments it’d be nice to run with this collector: │
+│ │
+│ 1. Right now, the write barrier design is intended to avoid branches as │
+│ much as possible (without relying on page faults). Instead, extra │
+│ nursery collections will be performed due to the remembered set being │
+│ larger than it needs to be. │
+│ │
+│ i. Are the branches really that expensive? In particular, does it end │
+│ up being a net win to add a branch on: │
+│ │
+│ a. Whether the written value is a fixnum at runtime or not. │
+│ │
+│ b. Whether the slot being written to is in a non-nursery object. │
+│ │
+│ c. Whether the value being written is a pointer to a non-old-heap │
+│ object. │
+│ │
+│ d. Whether the value being written is a pointer to an object in a │
+│ strictly younger region than the slot it’s being written to. │
+│ │
+│ ii. Is the branch on whether the nursery is full really that cheap? │
+│          Does it end up being a net win to store the nursery separately      │
+│          from the remembered set, and use guard pages to fault when each     │
+│          one is full?                                                        │
+│ │
+│ 2. The constants listed above were chosen by a combination of “that sounds │
+│ about right,” “Go does that,” and “OpenJDK does that.” It’s plausible │
+│ that instead, good defaults could be learned with a genetic algorithm │
+│ that tunes the constants in a much more general formula, e.g. │
+│ │
+│ nursery size = k₁ + k₂ × L1d size + k₃ × L2 size + k₄ × L3 size + … │
+│ │
+│ It would be useful to try learning good defaults on a wide variety of │
+│ machines and workloads, and seeing if natural constants “fall out.” │
+│ (Note that this means running a single learning process whose loss │
+│ function depends on the performance of the formula across all the │
+│ machines.) │
+│ │
+│ This could also be used alongside any “boolean-valued” experiments, to │
+│ learn whether they’re useful and how they affect the constants. │
+│ │
+│ Finally, this could plausibly even be a useful tool to ship to users │
+│       of the compiler, so they can tune GC constants for release builds      │
+│ of their own software. │
+│ │
+│ 3. The parallel variant pins threads to CPUs because many modern machines │
+│      have heterogeneous core clusters, so different CPUs have different      │
+│ L1d sizes. However, this is only likely to have reasonable performance │
+│ with a work-stealing green threading runtime in userspace, and even in │
+│ that case may have worse mutator performance than not pinning threads. │
+│ │
+│ Does pinning and having optimally-cache-sized nurseries really improve │
+│ performance that much versus uniformly sizing the nurseries and not │
+│ pinning? │
+│ │
+│ 4. Parallelizing the “update object fields to refer to the new location” │
+│ pass of the young-heap collection requires the “assign forwarding │
+│ pointers to objects” pass to estimate which parts of the young-heap are │
+│ assigned to which thread, since they must be aligned to the boundaries │
+│ between objects. │
+│ │
+│ In the above design, the passes are serialized, such that only after │
+│ all threads have been assigned to parts of the region do any of them │
+│ start updating object fields. Instead, we could imagine giving threads │
+│ non-uniformly-sized parts of the region, and having them start as soon │
+│ as they are assigned. │
+│ │
+│ Threads could also start the “shift all objects down” pass as soon as │
+│ all threads before them have finished the “update object fields” pass. │
+│ │
+│ i. Does interleaving the first two passes improve performance? │
+│ │
+│ ii. Should threads be given (approximately) equally-sized portions of │
+│ the region, or should they follow some curve, so threads that start │
+│ earlier are given more work to do? │
+│ │
+│ This curve’s parameters should probably be learned as above. │
+│ │
+│ iii. Does interleaving the last two passes improve performance? This │
+│ means shifting objects down while some (later) objects’ fields are │
+│ still being updated. │
+│ │
+│ iv. Does interleaving all three passes improve performance? │
+╰──────────────────────────────────────────────────────────────────────────────╯
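
The write barrier described in the README above does not exist in src/gc/gc.c yet, so the
following is only a sketch of the mechanics the README describes: every store of a value that
is not statically known to be a fixnum records the slot's address at the top of the nursery
region, and the only branch is the out-of-space check. The names write_barrier and
nursery_limit are assumptions (nursery_next stands in for the allocation cursor already in
gc.c), and the interaction between a store and the collection it may trigger is elided.

    #include <stdint.h>

    static uintptr_t nursery_next;  /* stand-in for gc.c's bump-allocation cursor (grows up) */
    static uintptr_t nursery_limit; /* assumed: bottom of the remembered set, which grows
                                       down from the top of the nursery region */

    void gc_collect(void);          /* declared in src/gc.h */

    static void write_barrier(uintptr_t *slot, uintptr_t value) {
      *slot = value;
      /* Unconditionally remember the slot; entries that turn out to hold fixnums or
       * pointers outside the nursery are discarded when the set is scanned. */
      nursery_limit -= sizeof(uintptr_t);
      *(uintptr_t **)nursery_limit = slot;
      /* The only branch: have the remembered set and the allocation cursor met? */
      if (nursery_limit <= nursery_next)
        gc_collect();
    }
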
diff --git a/misc/leaf-classes.lisp b/misc/leaf-classes.lisp
new file mode 100644
index 0000000..711890f
--- /dev/null
+++ b/misc/leaf-classes.lisp
@@ -0,0 +1,27 @@
+(in-package :cl-user)
+
+(defun class-in-cl-package (class)
+ (eql (symbol-package (class-name class)) (find-package 'cl)))
+
+(defun class-has-no-standard-subclasses (class)
+ (notany #'class-in-cl-package (sb-mop:class-direct-subclasses class)))
+
+(let ((queue nil)
+ (set nil))
+ (do-external-symbols (sym 'cl)
+ (let ((class (find-class sym nil)))
+ (when class
+ (push class queue))))
+ (loop
+ while queue
+ do (let ((class (pop queue)))
+ (cond
+ ((class-has-no-standard-subclasses class)
+ (push class set)))))
+ (flet ((name (class)
+ (symbol-name (class-name class))))
+ (setf set (stable-sort set #'string-lessp :key #'name))
+ (setf set (remove-duplicates set :key #'class-name)))
+ (loop
+ for class in set
+ do (format t "~a~%" class)))
diff --git a/src/Makefile b/src/Makefile
index 432b331..07f1cb5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -12,7 +12,7 @@ LDFLAGS =
CC = $(DEVKITPRO)/devkitARM/bin/arm-none-eabi-gcc
-CFLAGS_AUTO = -D__3DS__ -ffunction-sections -flto -g -march=armv6k -mtune=mpcore -mfloat-abi=hard -mtp=soft -mword-relocations -Og -std=c11 -Wall -Werror=implicit-function-declaration
+CFLAGS_AUTO = -D__3DS__ -ffunction-sections -flto -g -march=armv6k -mtune=mpcore -mfloat-abi=hard -mtp=soft -mword-relocations -O3 -std=c11 -Wall -Werror=implicit-function-declaration
LDFLAGS_AUTO = -L$(DEVKITPRO)/libctru/lib -specs=3dsx.specs
LDLIBS_AUTO = -lctru -lm
@@ -38,9 +38,7 @@ all: imb3.3dsx
imb3.3dsx: imb3.elf
3dsxtool $< $@
-imb3.elf: $(OBJS) obj/gc/gc.o
- $(CC) $(CFLAGS_ALL) $(LDFLAGS_ALL) -o $@ $^ $(LDLIBS_ALL)
-imb3-debug.elf: $(OBJS) obj/gc/gc-debug.o
+imb3.elf: $(OBJS) obj/gc/gc.o obj/platform/3ds.o
$(CC) $(CFLAGS_ALL) $(LDFLAGS_ALL) -o $@ $^ $(LDLIBS_ALL)
obj/%.o: $(srcdir)/%.c
diff --git a/src/gc.h b/src/gc.h
index dff3b79..d7ce8b0 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -2,15 +2,21 @@
#define IMB3_GC_H
#include "value.h"
-#include <stddef.h>
void gc_init(void);
void gc_collect(void);
struct object;
-struct value gc_read_value_slot(struct object *, size_t);
-uintptr_t gc_read_untraced_slot(struct object *, size_t);
-uint8_t gc_read_untraced_byte(struct object *, size_t);
+
+struct object *gc_alloc(size_t value_slot_count, size_t untraced_slot_count);
+struct object *gc_alloc_compiled_function(size_t value_slot_count,
+ size_t untraced_slot_count);
+struct object *gc_alloc_hashtable_eq(size_t value_slot_count,
+ size_t untraced_slot_count);
+
+struct value gc_read_value_slot(const struct object *, size_t);
+uintptr_t gc_read_untraced_slot(const struct object *, size_t);
+uint8_t gc_read_untraced_byte(const struct object *, size_t);
void gc_write_value_slot(struct object *, size_t, struct value);
void gc_write_untraced_slot(struct object *, size_t, uintptr_t);
void gc_write_untraced_byte(struct object *, size_t, uint8_t);
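
A hypothetical usage sketch of the allocation and slot API declared in gc.h above; the slot
counts, the stored values, and the example function are made up for illustration (the real
callers live in src/value.c).

    #include "gc.h"

    /* Assumes gc_init() has already run (see src/main.c). */
    static void gc_api_example(struct value some_value) {
      /* One object with two traced value slots and one untraced slot. */
      struct object *obj = gc_alloc(2, 1);

      gc_write_value_slot(obj, 0, some_value);        /* traced by the collector */
      gc_write_untraced_slot(obj, 0, (uintptr_t)42);  /* opaque to the collector */

      struct value v = gc_read_value_slot(obj, 0);
      uintptr_t raw = gc_read_untraced_slot(obj, 0);
      (void)v;
      (void)raw;
    }
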
diff --git a/src/gc/gc-debug.c b/src/gc/gc-debug.c
deleted file mode 100644
index e69de29..0000000
--- a/src/gc/gc-debug.c
+++ /dev/null
diff --git a/src/gc/gc.c b/src/gc/gc.c
index f53fd79..1c0daec 100644
--- a/src/gc/gc.c
+++ b/src/gc/gc.c
@@ -1,18 +1,12 @@
+#include "../gc.h"
+#include "../platform.h"
#include "../util.h"
#include <stdbool.h>
#include <stddef.h>
-#include <stdio.h>
+#include <stdint.h>
#include <stdlib.h>
/**
- * These cache-size details are from
- * https://www.copetti.org/writings/consoles/nintendo-3ds/. We call the second
- * level of cache (only available on N3DS) L3, since this GC design only cares
- * about L1D and L3.
- */
-static const size_t L1D_SIZE = 16 * 1024, L3_SIZE = 2 * 1024 * 1024;
-
-/**
* Constants that should probably be learned.
*/
static const size_t NURSERY_L1D_SIZE_FACTOR = 1;
@@ -43,19 +37,19 @@ enum object_kind {
OBJECT_UNTRACED,
/**
+ * A wrapper for a pointer to a code address. When this object gets collected,
+ * calls `free_code` on its first untraced slot.
+ */
+ OBJECT_COMPILED_FUNCTION,
+
+ /**
* A hashtable keyed by the addresses. This behaves like a normal object, but
* the first untraced slot gets set to the `uintptr_t` 1 whenever a collection
* traces the hashtable.
*
* The hashtable uses this information to rehash after each collection.
*/
- OBJECT_EQ_HASHTABLE,
-
- /**
- * A wrapper for a pointer to a code address. When this object gets collected,
- * calls `free_code` on its first untraced slot.
- */
- OBJECT_COMPILED_FUNCTION,
+ OBJECT_HASHTABLE_EQ,
};
/**
@@ -72,11 +66,11 @@ struct object_size {
/**
* Returns the number of slots for values the object has.
*/
-static size_t object_value_slots(struct object_size size) {
+static size_t object_size_value_slots(struct object_size size) {
switch (size.kind) {
case OBJECT_NORMAL:
- case OBJECT_EQ_HASHTABLE:
case OBJECT_COMPILED_FUNCTION:
+ case OBJECT_HASHTABLE_EQ:
return size.bits >> ((4 * sizeof(size_t)) - 3);
case OBJECT_UNTRACED:
return 0;
@@ -86,11 +80,11 @@ static size_t object_value_slots(struct object_size size) {
/**
* Returns the number of untraced slots the object has.
*/
-static size_t object_untraced_slots(struct object_size size) {
+static size_t object_size_untraced_slots(struct object_size size) {
switch (size.kind) {
case OBJECT_NORMAL:
- case OBJECT_EQ_HASHTABLE:
case OBJECT_COMPILED_FUNCTION:
+ case OBJECT_HASHTABLE_EQ:
return size.bits & ((1 << ((4 * sizeof(size_t)) - 3)) - 1);
case OBJECT_UNTRACED:
return size.bits;
@@ -106,6 +100,20 @@ struct object {
};
/**
+ * Returns the number of slots for values the object has.
+ */
+static size_t object_value_slots(const struct object *obj) {
+ return object_size_value_slots(((struct object_size *)obj)[-1]);
+}
+
+/**
+ * Returns the number of untraced slots the object has.
+ */
+static size_t object_untraced_slots(const struct object *obj) {
+ return object_size_untraced_slots(((struct object_size *)obj)[-1]);
+}
+
+/**
* This is the layout of an object in the nursery.
*/
struct nursery_object {
@@ -120,8 +128,8 @@ struct nursery_object {
* with one.
*/
struct young_heap_object {
- struct object_size size;
size_t fwd;
+ struct object_size size;
uintptr_t slots[];
};
@@ -199,10 +207,10 @@ static struct old_heap_block old_heap_sentinels[SIZE_CLASS_COUNT];
void gc_init(void) {
// Allocate the nursery and young-heap.
nursery_bottom = nursery_next =
- (uintptr_t)malloc(L1D_SIZE * NURSERY_L1D_SIZE_FACTOR);
+ (uintptr_t)malloc(get_l1d_size() * NURSERY_L1D_SIZE_FACTOR);
assume(nursery_bottom);
young_heap_bottom = young_heap_next =
- (uintptr_t)malloc(L3_SIZE * YOUNG_HEAP_L3_SIZE_FACTOR);
+ (uintptr_t)malloc(get_l3_size() * YOUNG_HEAP_L3_SIZE_FACTOR);
assume(young_heap_bottom);
// Self-link the old-heap.
@@ -211,3 +219,48 @@ void gc_init(void) {
&old_heap_sentinels[i];
}
}
+
+struct object *gc_alloc(size_t value_slot_count, size_t untraced_slot_count) {
+ todo("gc_alloc");
+}
+
+struct object *gc_alloc_compiled_function(size_t value_slot_count,
+ size_t untraced_slot_count) {
+ todo("gc_alloc_compiled_function");
+}
+
+struct object *gc_alloc_hashtable_eq(size_t value_slot_count,
+ size_t untraced_slot_count) {
+ todo("gc_alloc_hashtable_eq");
+}
+
+struct value gc_read_value_slot(const struct object *obj, size_t slot_index) {
+ assume(slot_index < object_value_slots(obj));
+ todo("gc_read_value_slot");
+}
+
+uintptr_t gc_read_untraced_slot(const struct object *obj, size_t slot_index) {
+ assume(slot_index < object_untraced_slots(obj));
+ todo("gc_read_untraced_slot");
+}
+
+uint8_t gc_read_untraced_byte(const struct object *obj, size_t byte_index) {
+ todo("gc_read_untraced_byte");
+}
+
+void gc_write_value_slot(struct object *obj, size_t slot_index,
+ struct value value) {
+ assume(slot_index < object_value_slots(obj));
+ todo("gc_write_value_slot");
+}
+
+void gc_write_untraced_slot(struct object *obj, size_t slot_index,
+ uintptr_t value) {
+ assume(slot_index < object_untraced_slots(obj));
+ todo("gc_write_untraced_slot");
+}
+
+void gc_write_untraced_byte(struct object *obj, size_t byte_index,
+ uint8_t byte) {
+ todo("gc_write_untraced_byte");
+}
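
gc_alloc above is still a todo(); a sketch of the nursery bump allocation the README describes
might look like the following. It assumes the header layout implied by object_value_slots()
(an object_size header immediately before the slots, with the object pointer just past it),
an assumed nursery_limit marking the bottom of the remembered set, and packs the slot counts
as the inverse of object_size_value_slots() and object_size_untraced_slots(). Overflow checks
and the large-object path into the young-heap are elided.

    /* Hypothetical body for gc_alloc, written against gc.c's internal declarations;
     * this is not the implementation in this commit. */
    struct object *gc_alloc(size_t value_slot_count, size_t untraced_slot_count) {
      size_t bytes = sizeof(struct object_size) +
                     (value_slot_count + untraced_slot_count) * sizeof(uintptr_t);

      /* Collect if this allocation would run into the remembered set, which grows
       * down from the top of the nursery region (nursery_limit is assumed here). */
      if (nursery_next + bytes > nursery_limit)
        gc_collect();

      struct object_size *header = (struct object_size *)nursery_next;
      nursery_next += bytes;

      header->kind = OBJECT_NORMAL;
      header->bits = (value_slot_count << ((4 * sizeof(size_t)) - 3)) |
                     untraced_slot_count;

      /* The object pointer points just past its size header, matching the [-1]
       * indexing in object_value_slots() and object_untraced_slots(). */
      return (struct object *)(header + 1);
    }
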
diff --git a/src/main.c b/src/main.c
index c8a1115..7be1fa7 100644
--- a/src/main.c
+++ b/src/main.c
@@ -7,6 +7,9 @@
int main(int argc, char **argv) {
gfxInit(GSP_BGR8_OES, GSP_BGR8_OES, false);
+ gc_init();
+ bootstrap();
+
// Initialize console on top screen. Using NULL as the second argument tells
// the console library to use the internal console structure as current one
consoleInit(GFX_TOP, NULL);
@@ -20,8 +23,6 @@ int main(int argc, char **argv) {
printf("\x1b[29;16HPress Start to exit.\n");
- gc_init();
-
// Main loop
while (aptMainLoop()) {
// Scan all the inputs. This should be done once for each frame
diff --git a/src/platform.h b/src/platform.h
new file mode 100644
index 0000000..2dbfd26
--- /dev/null
+++ b/src/platform.h
@@ -0,0 +1,13 @@
+#ifndef IMB3_PLATFORM_H
+#define IMB3_PLATFORM_H
+
+#include <stddef.h>
+#include <stdnoreturn.h>
+
+size_t get_l1d_size(void);
+size_t get_l3_size(void);
+
+void panic_begin(void);
+noreturn void panic_end(void);
+
+#endif // IMB3_PLATFORM_H
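
The only implementation of this interface in the tree is the 3DS one below. A hosted build
might implement it roughly as follows; the sysconf() cache queries are glibc-specific, the
fallback sizes are placeholders, and the panic functions mirror the non-3DS branch removed
from src/util.c in this commit.

    /* src/platform/linux.c -- hypothetical hosted implementation of platform.h */
    #include "../platform.h"
    #include <stdlib.h>
    #include <unistd.h>

    size_t get_l1d_size(void) {
      long size = sysconf(_SC_LEVEL1_DCACHE_SIZE); /* glibc-specific */
      return size > 0 ? (size_t)size : 32 * 1024;  /* placeholder fallback */
    }

    size_t get_l3_size(void) {
      long size = sysconf(_SC_LEVEL3_CACHE_SIZE);  /* glibc-specific */
      return size > 0 ? (size_t)size : 2 * 1024 * 1024;
    }

    void panic_begin(void) {}

    noreturn void panic_end(void) { abort(); }
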
diff --git a/src/platform/3ds.c b/src/platform/3ds.c
new file mode 100644
index 0000000..888baf7
--- /dev/null
+++ b/src/platform/3ds.c
@@ -0,0 +1,33 @@
+#include <3ds.h>
+
+#include "../platform.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+/**
+ * These cache-size details are from
+ * https://www.copetti.org/writings/consoles/nintendo-3ds/. We call the second
+ * level of cache (only available on N3DS) L3, since the GC design only cares
+ * about L1D and L3.
+ */
+size_t get_l1d_size(void) { return 16 * 1024; }
+size_t get_l3_size(void) { return 2 * 1024 * 1024; }
+
+void panic_begin(void) { consoleInit(GFX_TOP, NULL); }
+
+noreturn void panic_end(void) {
+ printf("\nPress Start to exit.\n");
+
+ while (aptMainLoop()) {
+ hidScanInput();
+ u32 keys = hidKeysDown();
+ if (keys & KEY_START)
+ break;
+ gfxFlushBuffers();
+ gfxSwapBuffers();
+ gspWaitForVBlank();
+ }
+
+ gfxExit();
+ exit(1);
+}
diff --git a/src/util.c b/src/util.c
index c219831..14a213a 100644
--- a/src/util.c
+++ b/src/util.c
@@ -1,43 +1,12 @@
#include "util.h"
+#include "platform.h"
#include <stdarg.h>
#include <stdio.h>
-#include <stdlib.h>
-
-#ifdef __3DS__
-
-#include <3ds.h>
-
-static void panic_begin(void) { consoleInit(GFX_TOP, NULL); }
-
-static noreturn void panic(void) {
- printf("\nPress Start to exit.\n");
-
- while (aptMainLoop()) {
- hidScanInput();
- u32 keys = hidKeysDown();
- if (keys & KEY_START)
- break;
- gfxFlushBuffers();
- gfxSwapBuffers();
- gspWaitForVBlank();
- }
-
- gfxExit();
- exit(1);
-}
-
-#else
-
-static void panic_begin(void) {}
-
-static noreturn void panic(void) { abort(); }
-
-#endif
noreturn void assume__failed(const char *file, int line, const char *expr) {
panic_begin();
fprintf(stderr, "%s:%d: assertion failed: %s\n", file, line, expr);
- panic();
+ panic_end();
}
noreturn void todo__impl(const char *file, int line, const char *fmt, ...) {
@@ -47,5 +16,5 @@ noreturn void todo__impl(const char *file, int line, const char *fmt, ...) {
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
- panic();
+ panic_end();
}
diff --git a/src/value.c b/src/value.c
index 66d388c..07af689 100644
--- a/src/value.c
+++ b/src/value.c
@@ -1 +1,112 @@
#include "value.h"
+#include "gc.h"
+
+enum builtin {
+ BUILTIN_BUILTIN_CLASS_BIGNUM,
+ BUILTIN_BUILTIN_CLASS_BUILTIN_CLASS,
+ BUILTIN_BUILTIN_CLASS_BUILTIN_FUNCTION,
+ BUILTIN_BUILTIN_CLASS_BUILTIN_OBJECT,
+ BUILTIN_BUILTIN_CLASS_CHARACTER,
+ BUILTIN_BUILTIN_CLASS_COMPILED_FUNCTION,
+ BUILTIN_BUILTIN_CLASS_CONS,
+ BUILTIN_BUILTIN_CLASS_DIRECT_SLOT_DEFINITION,
+ BUILTIN_BUILTIN_CLASS_EFFECTIVE_SLOT_DEFINITION,
+ BUILTIN_BUILTIN_CLASS_FIXNUM,
+ BUILTIN_BUILTIN_CLASS_GENERIC_FUNCTION,
+ BUILTIN_BUILTIN_CLASS_HASHTABLE_EQ,
+ BUILTIN_BUILTIN_CLASS_HASHTABLE_EQUAL,
+ BUILTIN_BUILTIN_CLASS_METHOD,
+ BUILTIN_BUILTIN_CLASS_NULL,
+ BUILTIN_BUILTIN_CLASS_PACKAGE,
+ BUILTIN_BUILTIN_CLASS_STANDARD_CLASS,
+ BUILTIN_BUILTIN_CLASS_STANDARD_OBJECT,
+ BUILTIN_BUILTIN_CLASS_STRING,
+ BUILTIN_BUILTIN_CLASS_SYMBOL,
+ BUILTIN_BUILTIN_CLASS_THREADED_FUNCTION,
+ BUILTIN_BUILTIN_CLASS_VECTOR_VALUE_ADJ_FILL,
+ BUILTIN_BUILTIN_CLASS_VECTOR_VALUE_NOADJ_FILL,
+ BUILTIN_BUILTIN_CLASS_VECTOR_VALUE_NOADJ_NOFILL,
+
+ BUILTIN_FUNCTION_CLASS_DIRECT_SLOTS,
+ BUILTIN_FUNCTION_CLASS_DIRECT_SUBCLASSES,
+ BUILTIN_FUNCTION_CLASS_DIRECT_SUPERCLASSES,
+ BUILTIN_FUNCTION_CLASS_NAME,
+ BUILTIN_FUNCTION_CLASS_PRECEDENCE_LIST,
+ BUILTIN_FUNCTION_CLASS_SLOTS,
+ BUILTIN_FUNCTION_EQ,
+ BUILTIN_FUNCTION_EQUAL,
+ BUILTIN_FUNCTION_GETHASH,
+ BUILTIN_FUNCTION_GETSLOT,
+ BUILTIN_FUNCTION_SETHASH,
+ BUILTIN_FUNCTION_SETSLOT,
+ BUILTIN_FUNCTION_SLOT_LOCATION,
+ BUILTIN_FUNCTION_SLOT_NAME,
+ BUILTIN_FUNCTION_SLOT_TYPE,
+ BUILTIN_FUNCTION_VECTOR_GET,
+ BUILTIN_FUNCTION_VECTOR_PUSH,
+ BUILTIN_FUNCTION_VECTOR_PUSH_EXTEND,
+ BUILTIN_FUNCTION_VECTOR_SET,
+
+ BUILTIN_VARIABLE_DSTACK,
+ BUILTIN_VARIABLE_RSTACK,
+
+ BUILTINS_COUNT,
+};
+
+static struct value builtins[BUILTINS_COUNT];
+
+enum builtin_class_slot_index {
+ BUILTIN_CLASS__NAME,
+ BUILTIN_CLASS__FINALIZED,
+ BUILTIN_CLASS__DIRECT_SUPERCLASSES,
+ BUILTIN_CLASS__DIRECT_SUBCLASSES,
+ BUILTIN_CLASS__PRECEDENCE_LIST,
+
+ BUILTIN_CLASS_SLOT_COUNT,
+};
+
+enum standard_class_slot_index {
+ STANDARD_CLASS__NAME,
+ STANDARD_CLASS__FINALIZED,
+ STANDARD_CLASS__DIRECT_SUPERCLASSES,
+ STANDARD_CLASS__DIRECT_SUBCLASSES,
+ STANDARD_CLASS__PRECEDENCE_LIST,
+ STANDARD_CLASS__DIRECT_SLOTS,
+ STANDARD_CLASS__SLOTS,
+
+ STANDARD_CLASS_SLOT_COUNT,
+};
+
+static void push(struct value value) {} /* stub: not implemented yet */
+
+static struct value pop(void) { return (struct value){.bits = 0}; } /* stub */
+
+struct value alloc_builtin_object(struct value class, size_t value_slot_count,
+ size_t untraced_slot_count) {
+ struct object *obj;
+ if (class.bits == builtins[BUILTIN_BUILTIN_CLASS_COMPILED_FUNCTION].bits)
+ obj = gc_alloc_compiled_function(value_slot_count, untraced_slot_count);
+ else if (class.bits == builtins[BUILTIN_BUILTIN_CLASS_HASHTABLE_EQ].bits)
+ obj = gc_alloc_hashtable_eq(value_slot_count, untraced_slot_count);
+ else
+ obj = gc_alloc(value_slot_count, untraced_slot_count);
+ gc_write_value_slot(obj, 0, class);
+ return (struct value){
+ .bits = (uintptr_t)obj | TAG_BUILTIN_OBJECT,
+ };
+}
+
+void bootstrap(void) {
+ // Since it is its own class, standard-class needs to be constructed
+ // manually.
+ {
+ struct object *obj = gc_alloc(STANDARD_CLASS_SLOT_COUNT + 1, 0);
+ builtins[BUILTIN_BUILTIN_CLASS_STANDARD_CLASS] = (struct value){
+ .bits = (uintptr_t)obj | TAG_BUILTIN_OBJECT,
+ };
+ gc_write_value_slot(obj, 0, builtins[BUILTIN_BUILTIN_CLASS_STANDARD_CLASS]);
+ // TODO
+ }
+
+ //
+}
diff --git a/src/value.h b/src/value.h
index a932514..3795486 100644
--- a/src/value.h
+++ b/src/value.h
@@ -1,6 +1,7 @@
#ifndef IMB3_VALUE_H
#define IMB3_VALUE_H
+#include <stddef.h>
#include <stdint.h>
struct value {
@@ -35,4 +36,15 @@ enum value_tag {
TAG_STANDARD_OBJECT = 0b110,
};
+/**
+ * Allocates a builtin-object with the given class and slot counts.
+ */
+struct value alloc_builtin_object(struct value class, size_t value_slot_count,
+                                  size_t untraced_slot_count);
+
+/**
+ * Bootstraps the class hierarchy. This should only be called once.
+ */
+void bootstrap(void);
+
#endif // IMB3_VALUE_H
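
value.h defines three-bit low tags (e.g. TAG_STANDARD_OBJECT = 0b110), and src/value.c builds
builtin-object values as (uintptr_t)obj | TAG_BUILTIN_OBJECT. Helpers along the following
lines would be the natural counterpart; the 0b111 mask and the untagging arithmetic are
assumptions about the tagging scheme rather than declarations from this commit.

    #include "value.h"

    struct object;

    /* Hypothetical helpers for the pointer-tagging scheme implied by value.h. */
    static enum value_tag value_tag_of(struct value v) {
      return (enum value_tag)(v.bits & 0b111); /* assumed three-bit tag mask */
    }

    static struct object *value_untag_object(struct value v) {
      /* Strip the tag bits to recover the object pointer stored by
       * alloc_builtin_object() in src/value.c. */
      return (struct object *)(v.bits & ~(uintptr_t)0b111);
    }
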