rpi-MMU-example - Simple kernel to present usage of Memory Management Unit, under Raspberry Pi 3B running libre firmware

/* arm mode, cortex-a7 compatibility
 *
 * _boot is entry point for the kernel.
 *
 * Kernel copies its embedded stage 2 to address 0x0 and jumps to
 * it (to the reset handler). Registers r0 - r2 are arguments for
 * the kernel; r2 is checked for ATAGS (which get relocated) and
 * r0 is set to 3 when no ATAGS are found.
 *
 * This file is based on (and almost identical with) loader_stage1.S
 */

.global _boot
_boot:
	// Only let the first core execute: read the multiprocessor
	// affinity register and keep its two lowest bits (core number).
	mrc p15, 0, r3, c0, c0, 5
	ands r3, r3, #3
	beq proceed

	// Every other core is parked here for good. wfe may return on
	// any event, so it has to sit in a loop - otherwise a woken core
	// would fall through into the data word below and then into
	// the code meant for the first core only.
	// This is still a kind of bluff - races can theoretically occur
	// when the main core overwrites this part of memory.
1:
	wfe
	b 1b

	// Value expected in the second word (the tag type field) of the
	// first tag of an ATAGS list (ATAG_CORE). Code below compares it
	// against what r2 points at, to decide whether ATAGS were
	// passed; if so, they get copied to free space so they don't
	// get overwritten by stage2 we deploy at 0x0.
atags_magic:
	.word 0x54410001
	
proceed:
	// Decide whether r2 points at an ATAGS list: the second word
	// of a genuine list equals atags_magic. (The original author
	// notes that ATAGS are always located at 0x100.)
	ldr r3, [r2, #4]	// r3 = second word of the passed structure
	ldr r4, atags_magic	// r4 = expected magic (pc-relative load)

	// equal -> it really is ATAGS; different -> something else
	// (e.g. an fdt)
	cmp r3, r4

	// r0 normally holds 0 on entry; the value 3 tells stage2 that
	// no ATAGS were found, in which case the copying of ATAGS is
	// skipped altogether
	movne r0, #3
	bne stage2_blob_copying

	// ATAGS were found - they are going to be copied elsewhere.

	// First measure the list. r3 walks over the tags, starting at
	// the first one (r2 keeps pointing at the start of the list).
	mov r3, r2

find_end_of_atags_loop:
	// r4 = first header word of the current tag: its size in words
	ldr r4, [r3]
	// step to the next tag: r3 += 4 * size
	add r3, r3, r4, lsl #2

	// r5 = second header word of that next tag: its type
	ldr r5, [r3, #4]

	// a tag of type 0 terminates the list; anything else - keep going
	cmp r5, #0
	bne find_end_of_atags_loop

	// r3 points at the terminating tag; its 8-byte header still
	// belongs to the list, so size = (r3 + 8) - r2
	sub r3, r3, r2
	add r3, r3, #8
	
	
	// At this point r2 holds the start address of atags and r3
	// their size in bytes. Now work out where there is free space
	// to put a copy: preferably right after the place the blob is
	// going to occupy (the blob is deployed at 0x0, so that place
	// ends at address blob_size) or, if the copy wouldn't fit
	// between there and the copying loop of stage1, right after
	// stage1. The chosen destination ends up in r5.

	// get blob size to r5
	adr r5, blob_size
	ldr r5, [r5]

	// atags are copied word by word, so the destination has to be
	// 4-aligned
	mov r6, #4
	bl aling_r5_to_r6

	// r6 = address where atags copied right after the blob would end
	add r6, r5, r3
	// we may only overwrite memory located before the copying loop
	adr r7, copy_atags_loop
	// both values are addresses, hence the unsigned "lower or same"
	// condition (a signed compare would misjudge values with the
	// top bit set)
	cmp r6, r7
	bls copy_atags

	// atags wouldn't fit - use memory after stage1 as destination:
	// r5 = _boot + stage1_size, rounded up to a multiple of 4
	adr r5, _boot
	adr r6, stage1_size
	ldr r6, [r6]
	add r5, r5, r6
	mov r6, #4
	bl aling_r5_to_r6
	
	
copy_atags:
	// Word-by-word copy of atags to the chosen destination.
	//   r2 - source (start of atags)
	//   r3 - number of bytes to copy (size of atags)
	//   r5 - destination
	//   r4 - byte offset of the word being copied
	//   r6 - temporary holding the word in transit
	mov r4, #0

copy_atags_loop:
	ldr r6, [r2, r4]
	str r6, [r5, r4]
	add r4, r4, #4
	cmp r4, r3		// unsigned compare: loop while r4 < r3
	blo copy_atags_loop

	// from now on r2 refers to the relocated copy of atags
	mov r2, r5
	// atags are dealt with - go on to deploying stage2
	b stage2_blob_copying

// Mini-function rounding r5 up to the nearest multiple of r6.
//   in:       r5 - value to align; r6 - alignment (callers pass 4;
//             the mask arithmetic presumes a power of two)
//   out:      r5 - aligned value (unchanged if already aligned)
//   clobbers: r7
//   returns to the address in lr
aling_r5_to_r6:
	sub r7, r6, #1		// r7 = mask of the low bits to clear
	sub r5, r5, #1		// so an already aligned value stays put
	bic r5, r5, r7		// round down to a multiple of r6 ...
	add r5, r5, r6		// ... and step up to the next one
	bx lr

	
stage2_blob_copying: // copy stage2 of the kernel to address 0x0

	// Set up the registers used by the copying loop:
	//   r3 - source address (start of the embedded blob)
	//   r4 - destination address (0x0)
	//   r5 - size of the blob in bytes
	//   r6 - count of bytes copied so far
	mov r4, #0
	mov r6, #0

	// address of stage2_start obtained pc-relatively (the PIC way)
	adr r3, stage2_start

	// The size might be too big for an immediate value, so it is
	// read from the word stored at blob_size.
	ldr r5, blob_size

	// Copying stage2 might overwrite this initial piece of code,
	// hence the actual copying loop sits after the stage2 blob.
	// To stay position-independent, the address of the blob's end
	// is computed as start + size into r7 and branched to.
	add r7, r3, r5
	bx r7

// Size in bytes of the embedded stage2 image, including the padding
// that rounds it up to a whole number of words (see stage2_end).
blob_size:
	.word stage2_end - stage2_start
// Size in bytes of stage1, measured from _boot to stage1_end
// (the embedded blob and the copying loop included).
stage1_size:
	.word stage1_end - _boot

// For ARM targets the argument of .align is a power of two,
// so the blob starts at a 16-byte boundary.
.align 4
stage2_start:
	.incbin "kernel_stage2.img"
	// The code preceding the blob jumps to stage2_start + blob_size
	// with bx, so that address must be a valid, word-aligned ARM
	// code address even when the image length is not a multiple
	// of 4. Pad up to a word boundary before placing stage2_end;
	// the loop below copies whole words anyway, so the padding
	// does not change how many words get copied.
	.align 2
stage2_end:

	// Copying loop, entered with:
	//   r3 - source (stage2_start), r4 - destination (0x0),
	//   r5 - number of bytes to copy, r6 - bytes copied so far (0)
	// Each word of the blob is loaded to r7 and stored from r7
	// to its destination.
loop:
	ldr r7, [r3, r6]
	str r7, [r4, r6]
	add r6, r6, #4
	cmp r6, r5		// unsigned compare: loop while r6 < r5
	blo loop

	// Call stage2 of the kernel (branch to 0x0,
	// which is the reset handler).
	bx r4
	
stage1_end: