split kernel into 2 stages; second stage gets copied to 0x0 and runs from there

author: Wojtek Kosior <kwojtus@protonmail.com> 2019-12-28 21:54:42 +0100
committer: Wojtek Kosior <kwojtus@protonmail.com> 2019-12-28 21:54:42 +0100
commit: 700f4c412d42c9b9811269045c0e363a0331bba9 (patch)
tree: 260feed1ca657843d993c1ae73e93f25a17cede1
parent: 80c9af17330ac442a4c3d6d55b4041cbe923e9b4 (diff)
download: rpi-MMU-example-700f4c412d42c9b9811269045c0e363a0331bba9.tar.gz
rpi-MMU-example-700f4c412d42c9b9811269045c0e363a0331bba9.zip
11 files changed, 288 insertions, 149 deletions
diff --git a/Makefile b/Makefile
index 29efa86..68368a5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,10 @@
-CFLAGS=-mcpu=cortex-a7 -ffreestanding -std=gnu11 -Wall -Wextra -O2 -fPIC -I.
+CFLAGS=-mcpu=cortex-a7 -ffreestanding -std=gnu11 -Wall -Wextra -O2 -I.
 ELFFLAGS=-nostdlib -lgcc
 
 ARM_OBJECTS=kernel.o paging.o demo_functionality.o PL0_test.o uart.o loader_stage1.o loader_stage2.o
 
+KERNEL_STAGE2_OBJECTS=setup.o interrupt_vector.o interrupts.o uart.o demo_functionality.o paging.o ramfs_embeddable.o ramfs.o
+
 RAMFS_FILES=PL_0_test.img
 
 all : kernel.img
@@ -10,20 +12,32 @@ all : kernel.img
 %.o : %.c
 	arm-none-eabi-gcc $(CFLAGS) -c $^ -o $@
 
-%.o : %.S
-	arm-none-eabi-as -mcpu=cortex-a7 $^ -o $@
-
 %.img : %.elf
 	arm-none-eabi-objcopy $^ -O binary $@
 
+%.o : %.S
+	arm-none-eabi-as -mcpu=cortex-a7 $^ -o $@
+
 %_embeddable.o : %.img
 	arm-none-eabi-objcopy -I binary -O elf32-littlearm -B arm --rename-section .data=.rodata $^ $@
 
 PL_0_test.elf : PL0_test.o uart.o
 	arm-none-eabi-gcc -T PL0_test.ld -o $@ $(ELFFLAGS) $^
 
-kernel.elf : boot.o kernel.o uart.o demo_functionality.o paging.o interrupt_vector.o interrupts.o ramfs_embeddable.o ramfs.o
-	arm-none-eabi-gcc -T linker.ld -o $@ $(ELFFLAGS) $^
+kernel_stage1.o : kernel_stage1.S kernel_stage2.img
+	arm-none-eabi-as -mcpu=cortex-a7 $< -o $@
+
+kernel.elf : kernel_stage1.ld kernel_stage1.o
+	arm-none-eabi-gcc -T $< -o $@ $(ELFFLAGS) kernel_stage1.o
+
+kernel.img : kernel.elf
+	arm-none-eabi-objcopy $^ -O binary $@
+
+kernel_stage2.elf : kernel_stage2.ld $(KERNEL_STAGE2_OBJECTS)
+	arm-none-eabi-gcc -T $< -o $@ $(ELFFLAGS) $(KERNEL_STAGE2_OBJECTS)
+
+#kernel.elf : boot.o kernel.o uart.o demo_functionality.o paging.o interrupt_vector.o interrupts.o ramfs_embeddable.o ramfs.o
+#	arm-none-eabi-gcc -T linker.ld -o $@ $(ELFFLAGS) $^
 
 loader_stage2.elf : loader_stage2.o uart.o
 	arm-none-eabi-gcc -T loader_stage2_linker.ld -o $@ $(ELFFLAGS) $^
@@ -42,6 +56,9 @@ loader.img : loader.elf
 qemu-elf : kernel.elf
 	qemu-system-arm -m 256 -M raspi2 -serial stdio -kernel $^
 
+qemu-img : kernel.img
+	qemu-system-arm -m 256 -M raspi2 -serial stdio -kernel $^
+
 qemu-bin : loader.img kernel.img pipe_image
 	./pipe_image --stdout | qemu-system-arm -m 256 -M raspi2 -serial stdio -kernel $<
 
diff --git a/boot.S b/boot.S
deleted file mode 100644
index 593ed11..0000000
--- a/boot.S
+++ /dev/null
@@ -1,27 +0,0 @@
-// armv7 mode
- 
-// Entry point for the kernel.
-// r15 -> should begin execution at 0x8000.
-// r0 -> 0x00000000
-// r1 -> 0x00000C42
-// r2 -> 0x00000100 - start of ATAGS
-// preserve these registers as argument for kernel_main
-	
-.global _boot // make entry point label global
-_boot:
-        // Only let the first core execute
-        mrc p15, 0, r3, c0, c0, 5
-        and r3, r3, #3
-        cmp r3, #0
-        beq proceed
-	// this is a kind of blef - races can theoretically still occur
-	// when the main core overwrites this part of memory
-	wfe
-
-proceed:
-        // Initialize the stack (_stack_top is defined in linker.ld)
-        ldr sp, =_stack_top
- 
-        // Call kernel_main
-        ldr r3, =kernel_main
-        bx r3
diff --git a/demo_functionality.c b/demo_functionality.c
index 4b002d6..420639b 100644
--- a/demo_functionality.c
+++ b/demo_functionality.c
@@ -53,7 +53,7 @@ void demo_current_mode(void)
   uart_puts(mode_name);
 }
 
-#define TRANSLATION_TABLE						\
+#define TRANSLATION_TABLE					     \
   ((short_section_descriptor_t volatile*) TRANSLATION_TABLE_BASE)
 
 extern char
@@ -144,29 +144,7 @@ void demo_go_unprivileged(void)
   write_SPSR(new_SPSR);
 
   uart_puts("All ready, jumping to PL0 code\n\r");
-  
+
   asm volatile("ldm %0, {r0 - r15} ^" ::
 	       "r" (PL0_regs));
 }
-
-extern char
-  __interrupts_start,
-  __interrupts_end,
-  __interrupts_size;
-
-extern void (*volatile system_reentry_point)(void);
-
-void system_reentry(void)
-{
-  uart_puts("re-entered system");
-  while(1);
-}
-
-void demo_setup_interrupts(void)
-{
-  system_reentry_point = system_reentry;
-
-  for (size_t i = 0; i < (size_t) &__interrupts_size; i++)
-    ((volatile char*) 0)[i] =
-      (&__interrupts_start)[i];
-}
diff --git a/interrupt_vector.S b/interrupt_vector.S
index d20bf6d..6037b7c 100644
--- a/interrupt_vector.S
+++ b/interrupt_vector.S
@@ -1,22 +1,45 @@
-.section ".interrupts.vector"
-
-.global abort_handler
-.local generic_handler	
-.global _interrupt_vectors
 _interrupt_vectors:
-	b generic_handler
-	b generic_handler
-	b generic_handler
+	b reset_handler_caller
+	b undef_handler_caller
+	b svc_handler_caller
 	b abort_handler_caller
 	b abort_handler_caller
-	b generic_handler
-	b generic_handler
-
-.section ".interrupts.text"
+	b generic_handler_caller
+	b irq_handler_caller
+	b fiq_handler_caller
+	
+reset_handler_caller:
+	ldr sp, =_stack_top
+	ldr r5, =reset_handler
+	bx r5
+	
+undef_handler_caller:
+	ldr sp, =_stack_top
+	ldr r5, =undefined_instruction_vector
+	bx r5
 
-generic_handler:
-	b generic_handler
+svc_handler_caller:
+	ldr sp, =_stack_top
+	ldr r5, =supervisor_call_handler
+	bx r5
+	
 abort_handler_caller:
-	mov sp, #0x8000
+	ldr sp, =_stack_top
 	ldr r5, =abort_handler
 	bx r5
+
+generic_handler_caller:
+	ldr sp, =_stack_top
+	ldr r5, =generic_handler
+	bx r5
+
+irq_handler_caller:
+	ldr sp, =_stack_top
+	ldr r5, =irq_handler
+	bx r5
+
+fiq_handler_caller:
+	ldr sp, =_stack_top
+	ldr r5, =fiq_handler
+	bx r5
+	
diff --git a/interrupts.c b/interrupts.c
index 6952f89..1b0590a 100644
--- a/interrupts.c
+++ b/interrupts.c
@@ -1,10 +1,20 @@
 #include "uart.h"
-/**
-    @brief The undefined instruction interrupt handler
 
-    If an undefined instruction is encountered, the CPU will start
-    executing this function. Just trap here as a debug solution.
-*/
+void setup(void);
+
+void reset_handler(void)
+{
+  static _Bool setup_done;
+  
+  if (!setup_done)
+    setup();
+
+  setup_done = 1;
+
+  // TODO do something here
+  while(1);
+}
+
 void
 __attribute__((interrupt("UNDEF")))
 __attribute__((section(".interrupts.text")))
@@ -17,13 +27,40 @@ undefined_instruction_vector(void)
     }
 }
 
-void __attribute__((section(".interrupts.data")))
-(*system_reentry_point) (void);
+void supervisor_call_handler(void)
+{
+  uart_puts("something svc happened\n\r");
+
+  while(1);
+}
 
 void
 __attribute__((interrupt("ABORT")))
-__attribute__((section(".interrupts.text")))
 abort_handler(void)
 {
-  system_reentry_point();
+  uart_puts("re-entered system\n\r");
+
+  while(1);
+}
+
+void generic_handler(void)
+{
+  uart_puts("something weird happened\n\r");
+
+  while(1);
+}
+
+void irq_handler(void)
+{
+  uart_puts("irq happened\n\r");
+
+  while(1);
+}
+
+void fiq_handler(void)
+{
+  uart_puts("fiq happened\n\r");
+
+  while(1);
 }
+
diff --git a/kernel_stage1.S b/kernel_stage1.S
new file mode 100644
index 0000000..1e0f614
--- /dev/null
+++ b/kernel_stage1.S
@@ -0,0 +1,67 @@
+/* arm mode, cortex-a7 compatibility
+ *
+ * _boot is entry point for the kernel.
+ *
+ * Kernel copies its embedded stage 2 to address 0x0 and jumps to
+ * it (to the reset handler). Registers r0 - r2 are arguments for
+ * the kernel, but we're not using them for now.
+ *
+ * This file is based on (and almost identical with) loader_stage1.S
+ */
+	
+.global _boot
+_boot:
+        // Only let the first core execute
+        mrc p15, 0, r3, c0, c0, 5
+        and r3, r3, #3
+        cmp r3, #0
+        beq proceed
+	// this is a kind of bluff - races can theoretically still
+	// occur when the main core overwrites this part of memory
+	wfe
+
+proceed:
+	// copy stage2 of the kernel to address 0x0
+
+	// first, load address of stage2_start to r3 (a PIC way)
+	adr r3, stage2_start
+
+	// load destination address for stage2 code to r4
+	mov r4, #0
+
+	// load blob size to r5
+	// The size might get too big for an immediate value, so
+	// we load it from memory.
+	adr r5, blob_size
+	ldr r5, [r5]
+
+	// r6 is the counter - counts the bytes copied
+	mov r6, #0
+
+	// This initial piece of code might get overwritten when we
+	// copy stage2, so the actual copying loop shall be after
+	// stage2 blob. We want this asm code to be PIC, so we're
+	// computing address of stage2_end into r7.
+	add r7, r3, r5
+	bx r7
+	
+blob_size:
+	.word stage2_end - stage2_start
+
+.align 4
+stage2_start:
+	.incbin "kernel_stage2.img"
+stage2_end:
+
+	// each word of the blob is loaded to r7 and stored
+	// from r7 to its destination in a loop
+loop:
+	ldr r7, [r3, r6]
+	str r7, [r4, r6]
+	add r6, r6, #4
+	cmp r6, r5
+	blo loop
+	
+        // Call stage2 of the kernel (branch to 0x0,
+	// which is the reset handler).
+        bx r4
diff --git a/kernel_stage1.ld b/kernel_stage1.ld
new file mode 100644
index 0000000..3130634
--- /dev/null
+++ b/kernel_stage1.ld
@@ -0,0 +1,27 @@
+ENTRY(_boot) /* defined in kernel_stage1.S; qemu needs it to run elf file */
+
+/* Code starts at  0x8000 - that's where RPis in 32-bit mode load
+ * kernel at. My experiments do, however, show, that qemu emulating
+ * RPi2 loads the kernel at 0x10000! (took some pain to find out).
+ * rpi-open-firmware, on the other hand, loads kernel at 0x2000000!
+ * This is not really a problem, since:
+ *   1. We can use our bootloader to load the kernel at 0x8000
+ *   2. We've rewritten stage 1 of both bootloader and kernel in
+ *      careful assembly, so that they should work regardless of
+ *      where they are loaded.
+ *   3. In qemu, we can load kernel.elf instead of raw binary
+ *      (qemu will do the right thing then)
+ */
+
+SECTIONS
+{
+
+	. = 0x8000;
+
+	__start = .;
+	.kernel_stage1 :
+	{
+		KEEP(kernel_stage1.o)
+	}
+	__end = .;
+}
diff --git a/kernel_stage2.ld b/kernel_stage2.ld
new file mode 100644
index 0000000..d3a23bf
--- /dev/null
+++ b/kernel_stage2.ld
@@ -0,0 +1,52 @@
+/* This second stage of the kernel is run from address 0x0 */
+
+TRANSLATION_TABLE_SIZE = 4096 * 4;
+MMU_SECTION_SIZE = 1 << 20;
+
+SECTIONS
+{
+
+	. = 0x0;
+
+	__start = .;
+	.kernel_stage2 :
+	{
+		KEEP(interrupt_vector.o)
+		. = ALIGN(4);
+		ramfs_embeddable.o
+		(*)
+	}
+	__end = .;
+
+	. = ALIGN(1 << 14);
+	
+	.translation_table (NOLOAD) :
+	{
+		_translation_table_start = .;
+		
+		. = . + TRANSLATION_TABLE_SIZE;
+
+		_translation_table_end = .;
+	}
+
+	. = ALIGN(1 << 20);
+	. = . + MMU_SECTION_SIZE;
+	
+	.stack (NOLOAD) :
+	{
+		_stack_start = .;
+		
+		. = . + MMU_SECTION_SIZE;
+
+		_stack_top = .;
+	}
+
+	.unprivileged_memory (NOLOAD) :
+	{
+		_unprivileged_memory_start = .;
+		
+		. = . + MMU_SECTION_SIZE;
+
+		_unprivileged_memory_end = .;
+	}
+}
diff --git a/linker.ld b/linker.ld
deleted file mode 100644
index 444bbf6..0000000
--- a/linker.ld
+++ /dev/null
@@ -1,47 +0,0 @@
-ENTRY(_boot) /* defined in boot.S; qemu needs it to run elf file */
- 
-SECTIONS
-{
-    /* Starts at  0x8000 - that's where RPis in 32-bit mode load */
-    /* kernel at. My experiments do, however, show, that qemu */
-    /* emulating RPi2 loads the kernel at 0x10000! (took some pain */
-    /* to find out). rpi-open-firmware, on the other hand, loads */
-    /* kernel at 0x2000000! */
-    /* This is not really a problem, since: */
-    /*   1. We can use our bootloader to load the kernel at 0x8000 */
-    /*   2. Stage 1 of the bootloader is written in careful */
-    /*      assembly, so that the loader itself should work */
-    /*      regardless of where it is loaded. */
-    /*   3. In qemu, we can load kernel.elf instead of raw binary */
-    /*      (qemu will do the right thing then) */
-
-    . = 0x8000;
-
-    /* RPi in 64-bit mode uses address 0x80000 instead */
-    
-    __start = .;
-    .kernel :
-    {
-        __kernel_start = .;
-	KEEP(boot.o)
-	. = ALIGN(4);
-	ramfs_embeddable.o
-	*(EXCLUDE_FILE (libkernel.o interrupt_vector.o interrupts.o) *)
-	__kernel_end = .;
-    }
-    __kernel_size = __kernel_end - __kernel_start;
-
-    .interrupts :
-    {
-	__interrupts_start = .;
-	KEEP(*(.interrupts.vector))
-	interrupt_vector.o
-	interrupts.o
-    	__interrupts_end = .;
-    }
-    __interrupts_size = __interrupts_end - __interrupts_start;
-    
-    __end = .;
-
-    _stack_top = __start;
-}
diff --git a/memory.h b/memory.h
index e4493e2..adc3bc0 100644
--- a/memory.h
+++ b/memory.h
@@ -1,7 +1,10 @@
 #ifndef MEMORY_H
 #define MEMORY_H
 
-#define POWER_OF_2(EXP) (((uint32_t) 1) << EXP)
+// These macros were heavily used before I moved all the address
+// computation to the linker script. Now I'm just keeping them
+// in case they're needed for something else :)
+#define POWER_OF_2(EXP) (((size_t) 1) << EXP)
 
 #define ALIGN_POWER_OF_2(ADDR, EXP)				\
   (((ADDR - 1) & ~(POWER_OF_2(EXP) - 1)) + POWER_OF_2(EXP))
@@ -10,33 +13,45 @@
 
 #define ALIGN_SECTION(ADDR) ALIGN_POWER_OF_2(ADDR, 20)
 
-#define INTERRUPT_VECTOR_TABLE_START ((uint32_t) 0x0)
 
-#define STACK_START ((uint32_t) 0x4000)
-#define STACK_END   ((uint32_t) 0x8000)
+// memory layout
+
+#define INTERRUPT_VECTOR_TABLE_START ((uint32_t) 0x0)
 
+// all those symbols are defined in the linker script
 extern const char __end;
 extern const char __start;
+extern const char _translation_table_start;
+extern const char _translation_table_end;
+extern const char _stack_start;
+extern const char _stack_top;
+extern const char _unprivileged_memory_start;
+extern const char _unprivileged_memory_end;
 
-#define KERNEL_START ((uint32_t) &__start) // this is 0x8000
-#define KERNEL_END   ((uint32_t) &__end)
+#define KERNEL_START ((size_t) &__start) // this is 0x0
+#define KERNEL_END   ((size_t) &__end)
 
 // first 2^14 aligned address after the kernel
-#define TRANSLATION_TABLE_BASE ALIGN_POWER_OF_2(KERNEL_END, 14)
-
-#define TRANSLATION_TABLE_END				\
-  (TRANSLATION_TABLE_BASE + (uint32_t) (4096 * 4))
+#define TRANSLATION_TABLE_BASE ((size_t) &_translation_table_start)
+#define TRANSLATION_TABLE_END  ((size_t) &_translation_table_end)
 
-#define PRIVILEGED_MEMORY_END ALIGN_SECTION(TRANSLATION_TABLE_END)
+// first section after the translation table is left unused;
+// the next section is used as the stack
+#define STACK_START ((size_t) &_stack_start)
+#define STACK_END   ((size_t) &_stack_top)
 
-#define UNPRIVILEGED_MEMORY_START PRIVILEGED_MEMORY_END
+#define PRIVILEGED_MEMORY_END STACK_END
 
+#define UNPRIVILEGED_MEMORY_START				\
+  ((size_t) &_unprivileged_memory_start)  // equal to STACK_END
 #define UNPRIVILEGED_MEMORY_END			\
-  (UNPRIVILEGED_MEMORY_START + SECTION_SIZE)
+  ((size_t) &_unprivileged_memory_end)
 
-#define PL0_SECTION_NUMBER ((uint32_t) 0b101010101010)
+#define PL0_SECTION_NUMBER ((size_t) 0xaaa)
 
 #define VIRTUAL_PL0_MEMORY_START (PL0_SECTION_NUMBER << 20)
+#define VIRTUAL_PL0_MEMORY_END			\
+  (VIRTUAL_PL0_MEMORY_START + SECTION_SIZE)
 
 #endif // MEMORY_H
 
diff --git a/kernel.c b/setup.c
index 719ceff..48df825 100644
--- a/kernel.c
+++ b/setup.c
@@ -2,7 +2,7 @@
 #include "demo_functionality.h"
 #include "paging.h"
 
-void kernel_main(void)
+void setup(void)
 {
   uart_init();
 
@@ -22,16 +22,13 @@ void kernel_main(void)
   // prints some info and sets upp translation table, turns on MMU
   setup_flat_map();
 
+  // prints some info and sets up a section for PL0 code,
+  // loads a blob there 
   demo_setup_PL0();
 
-  demo_setup_interrupts();
-  
-  // prints some info and sets up a section for PL0 code, loads a blob
-  // there and jumps to it... never, ever, ever returns
+  // jumps to unprivileged code... never, ever, ever returns
   demo_go_unprivileged();
   
-  while(1);
-  
   while (1)
     {
       char c = uart_getc();
author	Wojtek Kosior <kwojtus@protonmail.com>	2019-12-28 21:54:42 +0100
committer	Wojtek Kosior <kwojtus@protonmail.com>	2019-12-28 21:54:42 +0100
commit	700f4c412d42c9b9811269045c0e363a0331bba9 (patch)
tree	260feed1ca657843d993c1ae73e93f25a17cede1
parent	80c9af17330ac442a4c3d6d55b4041cbe923e9b4 (diff)
download	rpi-MMU-example-700f4c412d42c9b9811269045c0e363a0331bba9.tar.gz rpi-MMU-example-700f4c412d42c9b9811269045c0e363a0331bba9.zip