9 files changed, 599 insertions, 0 deletions
diff --git a/src/boot/kernel_stage1.S b/src/boot/kernel_stage1.S
new file mode 100644
index 0000000..e770513
--- /dev/null
+++ b/src/boot/kernel_stage1.S
@@ -0,0 +1,168 @@
+/* arm mode, cortex-a7 compatibility
+ *
+ * _boot is entry point for the kernel.
+ *
+ * Kernel copies its embedded stage 2 to address 0x0 and jumps to
+ * it (to the reset handler). Registers r0 - r2 are arguments for the
+ * kernel; r2 is updated if atags get moved, r0 is set to 3 if none found.
+ *
+ * This file is based on (and almost identical with) loader_stage1.S
+ */
+
+.global _boot
+_boot:
+        // Only let the first core execute
+        mrc p15, 0, r3, c0, c0, 5
+        and r3, r3, #3
+        cmp r3, #0
+        beq proceed
+	// Other cores are parked in a loop - a single wfe returns on any
+	// event and the core would then run into the code below. It's
+	// still a kind of bluff: the main core may overwrite this memory.
+park_secondary_core:
+	wfe
+	b park_secondary_core
+
+atags_magic: // expected second word of atags (compared in proceed)
+	.word 0x54410001
+	
+proceed:
+	// load the second word of structure passed to us through r2;
+	// if it's atags, its second word should be the magic number.
+	// Btw, location of ATAGS is always 0x100.
+	ldr r3, [r2, #4]      // r3 = second word of the structure
+	adr r4, atags_magic   // PC-relative, works wherever we're loaded
+	ldr r4, [r4]          // r4 = the magic number itself
+
+	// compare second word of assumed atags with magic number
+	// to see, if it's really atags and not sth else (e.g. fdt)
+	cmp r3, r4
+
+	// normally at start r0 contains value 0;
+	// value 3 in r0 would tell stage2 code, we found no atags :(
+	movne r0, #3
+	bne stage2_blob_copying // no atags - nothing to relocate
+
+	// if atags was found, copying of it takes place here
+	
+	// the following loop finds, where atags ends
+	// r3 shall point to currently looked-at tag
+	mov r3, r2
+
+find_end_of_atags_loop:
+	// load first word of tag header to r4 (it contains tag size)
+	ldr r4, [r3]
+	// make r3 point at the next tag (by adding 4*tag_size to it)
+	add r3, r4, lsl #2
+
+	// load second word of tag header to r5 (it contains tag type)
+	ldr r5, [r3, #4]
+
+	// if tag type is 0, it is the last tag
+	cmp r5, #0
+	bne find_end_of_atags_loop
+
+	add r3, #8 // r3 = end of last tag (just past its 2-word header)
+	sub r3, r2 // get atags size (in bytes) in r3
+	
+	// at this point r2 and r3 hold the start and size of atags,
+	// respectively; now we'll compute, where we're going to have
+	// free space to put atags in; we want to put atags either
+	// right after our blob or, if it doesn't fit between
+	// blob end and the address stage1 is loaded at, after stage1
+
+	// get blob size to r5 (blob goes to 0x0, so it's also its end)
+	adr r5, blob_size
+	ldr r5, [r5]
+
+	// we could only copy atags to a 4-aligned address
+	mov r6, #4
+	bl aling_r5_to_r6
+
+	// compute where atags copied right after blob would end
+	add r6, r5, r3
+	// we can only overwrite stuff before the copying loop
+	adr r7, copy_atags_loop
+	cmp r6, r7
+	bls copy_atags // addresses are unsigned, hence bls and not ble
+
+	// atags wouldn't fit - use memory after stage1 as destination
+	adr r5, _boot
+	adr r6, stage1_size
+	ldr r6, [r6]
+	add r5, r6
+	mov r6, #4
+	bl aling_r5_to_r6
+	
+copy_atags:
+	// now copy atags (r2 - atags start; r3 - atags size;
+	// r5 - destination; r4 - iterator; r6 - buffer)
+	mov r4, #0
+
+copy_atags_loop:
+	ldr r6, [r2, r4]
+	str r6, [r5, r4]
+	add r4, #4
+	cmp r4, r3
+	blo copy_atags_loop
+
+	mov r2, r5 // place the new atags address in r2
+	b stage2_blob_copying // atags stuff done; proceed
+
+// mini-function: rounds r5 up to a multiple of r6; clobbers r7
+aling_r5_to_r6: // r6 is assumed to be a power of 2 (callers pass 4)
+	sub r5, #1     // r5 - 1, so an already aligned value stays put
+	sub r7, r6, #1 // r7 = mask of the low bits (r6 - 1)
+	bic r5, r7     // clear the low bits: round down
+	add r5, r6     // step up to the next multiple of r6
+	mov pc, lr     // return to the caller
+
+	
+stage2_blob_copying: // copy stage2 of the kernel to address 0x0
+
+	// first, load address of stage2_start to r3 (a PIC way)
+	adr r3, stage2_start
+
+	// load destination address for stage2 code to r4
+	mov r4, #0
+
+	// load blob size to r5
+	// The size might get too big for an immediate value, so
+	// we load it from memory.
+	adr r5, blob_size
+	ldr r5, [r5]
+
+	// r6 is the counter - counts the bytes copied
+	mov r6, #0
+
+	// This initial piece of code might get overwritten when we
+	// copy stage2, so the actual copying loop shall be after
+	// stage2 blob. We want this asm code to be PIC, so we're
+	// computing address of stage2_end into r7.
+	add r7, r3, r5 // r7 = stage2_start + blob size = stage2_end
+	bx r7          // continue in the loop placed right after the blob
+
+blob_size:
+	.word stage2_end - stage2_start // size of embedded stage2, in bytes
+stage1_size:
+	.word stage1_end - _boot // size of all of stage1, blob included
+
+.align 4 // on arm, gas reads this as 2^4, i.e. a 16-byte boundary
+stage2_start:
+	.incbin "kernel_stage2.img"
+stage2_end:
+
+	// each word of the blob is loaded to r7 and stored
+	// from r7 to its destination in a loop
+loop:
+	ldr r7, [r3, r6]
+	str r7, [r4, r6]
+	add r6, r6, #4
+	cmp r6, r5
+	blo loop // unsigned compare: there are still bytes left to copy
+
+        // Call stage2 of the kernel (branch to 0x0,
+	// which is the reset handler).
+        bx r4
+	
+stage1_end:
diff --git a/src/boot/kernel_stage1.ld b/src/boot/kernel_stage1.ld
new file mode 100644
index 0000000..3130634
--- /dev/null
+++ b/src/boot/kernel_stage1.ld
@@ -0,0 +1,27 @@
+ENTRY(_boot) /* defined in kernel_stage1.S; qemu needs it to run elf file */
+
+/* Code starts at  0x8000 - that's where RPis in 32-bit mode load
+ * kernel at. My experiments do, however, show, that qemu emulating
+ * RPi2 loads the kernel at 0x10000! (took some pain to find out).
+ * rpi-open-firmware, on the other hand, loads kernel at 0x2000000!
+ * This is not really a problem, since:
+ *   1. We can use our bootloader to load the kernel at 0x8000
+ *   2. We've rewritten stage 1 of both bootloader and kernel in
+ *      careful assembly, so that they should work regardless of
+ *      where they are loaded.
+ *   3. In qemu, we can load kernel.elf instead of raw binary
+ *      (qemu will do the right thing then)
+ */
+
+SECTIONS
+{
+
+	. = 0x8000; /* link address; see the note above on load addresses */
+
+	__start = .; /* first address of the image */
+	.kernel_stage1 :
+	{
+		KEEP(kernel_stage1.o) /* all of stage1, never garbage-collected */
+	}
+	__end = .; /* first address past the image */
+}
diff --git a/src/boot/kernel_stage2.ld b/src/boot/kernel_stage2.ld
new file mode 100644
index 0000000..9411ca2
--- /dev/null
+++ b/src/boot/kernel_stage2.ld
@@ -0,0 +1,80 @@
+/* This second stage of the kernel is run from address 0x0 */
+
+TRANSLATION_TABLE_SIZE = 4096 * 4; /* 16 KB */
+SECTIONS_LIST_SIZE = 4096 * 8;     /* 32 KB */
+MMU_SECTION_SIZE = 1 << 20;        /* 1 MB */
+
+SECTIONS
+{
+
+	. = 0x0;
+
+	__start = .;
+	.interrupt_vector : /* placed first, so it starts at address 0x0 */
+	{
+		KEEP(interrupt_vector.o)
+	}
+	. = ALIGN(4);
+	.embedded_ramfs : /* everything that comes from ramfs_embeddable.o */
+	{
+		ramfs_embeddable.o
+	}
+	.rest_of_kernel : /* everything else that gets linked in */
+	{
+		*(.text)
+		*(.data)
+		*(.rodata)
+		*(.bss)
+		*(COMMON) /* common symbols; ld names them COMMON, not /COMMON/ */
+		*(*)      /* catch-all for any input section not listed above */
+	}
+	__end = .;
+
+	. = ALIGN(1 << 14);
+	
+	.translation_table (NOLOAD) : /* address space only, no image contents */
+	{
+		_translation_table_start = .;
+		
+		. = . + TRANSLATION_TABLE_SIZE;
+
+		_translation_table_end = .;
+	}
+
+	.sections_list (NOLOAD) : /* address space only, no image contents */
+	{
+		_sections_list_start = .;
+		
+		. = . + SECTIONS_LIST_SIZE;
+
+		_sections_list_end = .;
+	}
+
+	. = ALIGN(1 << 20);
+	. = . + MMU_SECTION_SIZE;
+
+	.stack (NOLOAD) : /* three stacks laid out back to back */
+	{
+		_stack_start = .;
+		
+		_fiq_stack_start = .;
+		
+		. = . + (1 << 18); /* 256 KB */
+		
+		_fiq_stack_top = .;
+		
+		_irq_stack_start = .;
+		
+		. = . + (1 << 18); /* 256 KB */
+		
+		_irq_stack_top = .;
+		
+		_supervisor_stack_start = .;
+		
+		. = . + (1 << 19); /* 512 KB */
+		
+		_supervisor_stack_top = .;
+		
+		_stack_end = .;
+	}
+}
diff --git a/src/boot/loader_stage1.S b/src/boot/loader_stage1.S
new file mode 100644
index 0000000..69d78c5
--- /dev/null
+++ b/src/boot/loader_stage1.S
@@ -0,0 +1,55 @@
+/* arm mode, cortex-a7 compatibility
+ *
+ * _boot is entry point for the loader.
+ *
+ * Loader copies its embedded stage 2 to address 0x4000
+ * and jumps to it. Registers r0 - r2 are arguments for the kernel
+ * and should be left intact.
+ */
+	
+.global _boot
+_boot:
+        // Only let the first core execute
+        mrc p15, 0, r3, c0, c0, 5
+        and r3, r3, #3
+        cmp r3, #0
+        beq proceed
+park_secondary_core:          // other cores never leave this loop; a
+	wfe                   // lone wfe would return on any event. Races
+	b park_secondary_core // remain if the main core overwrites this code
+
+proceed:
+	// copy stage2 of the loader to address 0x4000
+	// first, load address of stage2_start to r3 (a PIC way)
+	adr r3, stage2_start
+
+	// load destination address for stage2 code to r4
+	mov r4, #0x4000
+
+	// load blob size to r5 (from memory - it might not fit an immediate)
+	adr r5, blob_size
+	ldr r5, [r5]
+
+	// r6 is the counter - counts the bytes copied
+	mov r6, #0
+
+	// each word of the blob goes through r7 to its destination
+loop:
+	ldr r7, [r3, r6]
+	str r7, [r4, r6]
+	add r6, r6, #4
+	cmp r6, r5
+	blo loop
+	
+	// Initialize the stack (_stack_top: see loader_stage1_linker.ld)
+	ldr sp, =_stack_top
+	
+        // Call stage2 of the loader (branch to 0x4000)
+        bx r4
+
+blob_size: // size of the embedded stage2, in bytes; never executed
+	.word stage2_end - stage2_start
+.align 4
+stage2_start:
+	.incbin "loader_stage2.img"
+stage2_end:
diff --git a/src/boot/loader_stage1_linker.ld b/src/boot/loader_stage1_linker.ld
new file mode 100644
index 0000000..711fcbf
--- /dev/null
+++ b/src/boot/loader_stage1_linker.ld
@@ -0,0 +1,16 @@
+ENTRY(_boot)
+ 
+SECTIONS
+{
+	/* see linker.ld for details */
+	. = 0x2000000; /* link address of the loader */
+
+	__start = .; /* first address of the image */
+	loader_stage1 :
+	{
+		KEEP(loader_stage1.o) /* all of stage1, never garbage-collected */
+	}
+	__end = .; /* first address past the image */
+
+    _stack_top = 0x8000; /* loaded into sp by loader_stage1.S */
+}
diff --git a/src/boot/loader_stage2.c b/src/boot/loader_stage2.c
new file mode 100644
index 0000000..fc3ae1c
--- /dev/null
+++ b/src/boot/loader_stage2.c
@@ -0,0 +1,33 @@
+#include <stddef.h>
+#include <stdint.h>
+#include "uart.h"
+#include "io.h"
+#include "global.h"
+
+void *const kernel_load_addr = ((void*) 0x8000);
+
+void _stage2_main(uint32_t r0, uint32_t r1, uint32_t atags)
+{
+  uint32_t kernel_size = 0;
+  unsigned int shift;
+  char *cursor, *image_end;
+
+  uart_init();
+
+  // kernel size comes first via uart: 4 bytes, little endian
+  for (shift = 0; shift < 32; shift += 8)
+    kernel_size |= ((uint32_t) getchar()) << shift;
+
+  // the image follows; store it byte by byte at kernel_load_addr
+  cursor = kernel_load_addr;
+  image_end = cursor + kernel_size;
+
+  for (; cursor < image_end; cursor++)
+    *cursor = getchar();
+
+  // jump to kernel, handing over the arguments we got ourselves
+  void (*kernel_entry)(uint32_t, uint32_t, uint32_t) =
+    (void(*)(uint32_t, uint32_t, uint32_t)) kernel_load_addr;
+  kernel_entry(r0, r1, atags);
+}
+
diff --git a/src/boot/loader_stage2_linker.ld b/src/boot/loader_stage2_linker.ld
new file mode 100644
index 0000000..33e79e9
--- /dev/null
+++ b/src/boot/loader_stage2_linker.ld
@@ -0,0 +1,16 @@
+ENTRY(_stage2_main)
+ 
+SECTIONS
+{
+	/* see loader_stage1.S for details */
+	. = 0x4000; /* the address stage1 copies this image to */
+
+	__start = .;
+	loader_stage2 :
+	{
+		KEEP(loader_stage2.o(.text)) /* this code goes first, at 0x4000 */
+		loader_stage2.o              /* the rest of loader_stage2.o */
+		uart.o
+	}
+	__end = .;
+}
diff --git a/src/boot/psr.h b/src/boot/psr.h
new file mode 100644
index 0000000..f300a7a
--- /dev/null
+++ b/src/boot/psr.h
@@ -0,0 +1,88 @@
+#ifndef PSR_H
+#define PSR_H
+
+#include <stdint.h>
+
+enum execution_mode { // values of the PSR mode field (bits 4:0)
+  MODE_USER       = 0x10, // 10000
+  MODE_FIQ        = 0x11, // 10001
+  MODE_IRQ        = 0x12, // 10010
+  MODE_SUPERVISOR = 0x13, // 10011
+  MODE_MONITOR    = 0x16, // 10110
+  MODE_ABORT      = 0x17, // 10111
+  MODE_HYPERVISOR = 0x1A, // 11010
+  MODE_UNDEFINED  = 0x1B, // 11011
+  MODE_SYSTEM     = 0x1F, // 11111
+};
+
+typedef union
+{
+  uint32_t raw; // the whole register as one 32-bit word
+  struct // same word by fields, from bit 0 up (assumes LSB-first layout)
+  {
+    uint32_t M_4_0      : 5; // bits 4:0
+    uint32_t T          : 1; // bit  5
+    uint32_t F          : 1; // bit  6
+    uint32_t I          : 1; // bit  7
+    uint32_t A          : 1; // bit  8
+    uint32_t E          : 1; // bit  9
+    uint32_t IT_7_2     : 6; // bits 15:10
+    uint32_t GE_3_0     : 4; // bits 19:16
+    uint32_t Bits_23_20 : 4; // bits 23:20 (reserved)
+    uint32_t J          : 1; // bit  24
+    uint32_t IT_1_0     : 2; // bits 26:25
+    uint32_t Q          : 1; // bit  27
+    uint32_t V          : 1; // bit  28
+    uint32_t C          : 1; // bit  29
+    uint32_t Z          : 1; // bit  30
+    uint32_t N          : 1; // bit  31
+#define PSR_MODE_4_0                    M_4_0
+#define PSR_THUMB_BIT                   T
+#define PSR_FIQ_MASK_BIT                F
+#define PSR_FIQ_MASKK_BIT               F /* old misspelling, kept for compatibility */
+#define PSR_IRQ_MASK_BIT                I
+#define PSR_ASYNC_ABORT_MASK_BIT        A
+#define PSR_ENDIANNESS_BIT              E
+#define PSR_IF_THEN_STATE_7_2           IT_7_2
+#define PSR_GREATER_THAN_OR_EQUAL_FLAGS GE_3_0
+#define PSR_JAZELLE_BIT                 J
+#define PSR_IF_THEN_STATE_1_0           IT_1_0
+#define PSR_CUMULATIVE_SATURATION_BIT   Q
+#define PSR_OVERFLOW_CONDITION_BIT      V
+#define PSR_CARRY_CONDITION_BIT         C
+#define PSR_ZERO_CONDITION_BIT          Z
+#define PSR_NEGATIVE_CONDITION_BIT      N
+  } fields;
+} PSR_t;
+
+inline static PSR_t read_CPSR(void)
+{
+  PSR_t CPSR;
+  // get content of current program status register; volatile, so
+  // that the compiler can neither drop nor reuse an earlier read
+  asm volatile("mrs %0, cpsr" : "=r" (CPSR.raw) :: "memory");
+  return CPSR;
+}
+
+inline static void write_CPSR(PSR_t CPSR)
+{
+  // store the given value in the current program status register
+  asm volatile("msr cpsr, %0" : : "r" (CPSR.raw) : "memory");
+}
+
+inline static PSR_t read_SPSR(void)
+{
+  PSR_t SPSR;
+  // get content of saved program status register; volatile, so
+  // that the compiler can neither drop nor reuse an earlier read
+  asm volatile("mrs %0, spsr" : "=r" (SPSR.raw) :: "memory");
+  return SPSR;
+}
+
+inline static void write_SPSR(PSR_t SPSR)
+{
+  // set content of saved program status register (same form as write_CPSR)
+  asm volatile("msr spsr, %0" :: "r" (SPSR.raw) : "memory");
+}
+
+#endif // PSR_H
diff --git a/src/boot/setup.c b/src/boot/setup.c
new file mode 100644
index 0000000..a96b19e
--- /dev/null
+++ b/src/boot/setup.c
@@ -0,0 +1,116 @@
+#include "uart.h"
+#include "utils/io.h"
+#include "demo_functionality.h"
+#include "paging.h"
+#include "atags.h"
+// for POWER_OF_2() macro... perhaps the macro should be moved
+#include "memory.h"
+#include "armclock.h"
+#include "scheduler.h"
+
+void setup(uint32_t r0, uint32_t machine_type,
+	   struct atag_header *atags)
+{
+  uint32_t memory_size = 0;
+
+  uart_init();
+
+  // The greeting below is lost when the screen session gets attached
+  // only after the kernel was loaded with socat, so block here until
+  // a single char arrives from within screen.
+  getchar();
+
+  puts("Hello, kernel World!");
+
+  prints("ARM machine type: 0x");
+  printhext(machine_type);
+  puts("");
+
+  // stage1 code puts value 3 in r0 when it found no atags
+  if (r0 != 3)
+    {
+      prints("ATAGS copied to 0x");
+      printhex((uint32_t) atags);
+      puts("");
+
+      puts("__ ATAGS contents __");
+      print_atags(atags);
+      puts("__ end of ATAGS contents __");
+
+      memory_size = find_memory_size(atags);
+    }
+  else
+    {
+      puts ("No ATAGS was found!");
+    }
+
+  if (!memory_size)
+    {
+      // Most Pis have more, but qemu might give us little
+      puts("Couldn't determine available memory - assuming 192MB");
+      memory_size = 192 * POWER_OF_2(20);
+    }
+  else
+    {
+      // report the size in the biggest unit that divides it evenly
+      char *unit;
+      uint32_t size_in_unit;
+
+      if (!(memory_size % POWER_OF_2(30)))
+        {
+          unit = "GB";
+          size_in_unit = memory_size / POWER_OF_2(30);
+        }
+      else if (!(memory_size % POWER_OF_2(20)))
+        {
+          unit = "MB";
+          size_in_unit = memory_size / POWER_OF_2(20);
+        }
+      else if (!(memory_size % POWER_OF_2(10)))
+        {
+          unit = "KB";
+          size_in_unit = memory_size / POWER_OF_2(10);
+        }
+      else
+        {
+          unit = "B";
+          size_in_unit = memory_size;
+        }
+
+      prints ("memory available: ");
+      printdect (size_in_unit);
+      puts (unit);
+    }
+
+  // assume we need at least one section for PL0
+  if (memory_size < PRIVILEGED_MEMORY_END + SECTION_SIZE)
+    {
+      puts("Not enough memory to continue");
+      while (1);
+    }
+
+  // both of these only print some info
+  demo_paging_support();
+  demo_current_mode();
+
+  setup_pager_structures(memory_size);
+
+  // prints some info and sets up translation table, turns on MMU
+  setup_flat_map();
+
+  puts("Initializing clock");
+  // sets some general settings for arm timer
+  armclk_init();
+
+  puts("Setting up scheduler's internal structures");
+  setup_scheduler_structures();
+
+  puts("Switching uart to use irqs");
+
+  // note, that kernel's puts() is still going to use blocking io
+  uart_irq_enable();
+
+  // prints some info and sets up a section for PL0 code, loads a blob
+  // there, then runs scheduler... never, ever, ever returns
+  demo_setup_PL0();
+}