欢迎来到尧图网

客户服务 关于我们

您的位置:首页 > 新闻 > 社会 > Linux 5.0在start_kernel之前做了什么事?(以aarch64为例)

Linux 5.0在start_kernel之前做了什么事?(以aarch64为例)

2025/11/6 7:30:44 来源:https://blog.csdn.net/qq_48322523/article/details/142427891  浏览:    关键词:Linux 5.0在start_kernel之前做了什么事?(以aarch64为例)

目录

  • 引言
  • 汇编启动!!!
  • 细节剖析

引言

之前在研究Linux内核源码的时候,总是找不到关于这部分源码的详细剖析:有的讲得模棱两可,有的虽然比较专业,但内容比较分散,感觉大家都不太喜欢这部分代码。正好今天周末,这段时间也在学习Arm64汇编,就以这部分代码为研究对象进行解析

源码版本:Linux 5.0

架构信息

  • 芯片架构:ARM64
  • 内存架构:UMA
  • CONFIG_ARM64_VA_BITS:39
  • CONFIG_ARM64_PAGE_SHIFT:12
  • CONFIG_PGTABLE_LEVELS:3

之前写的一些相关文章:

Linux内存管理:Bootmem的率先登场

Linux内存管理:Buddy System姗姗来迟

Linux内存管理:Slab闪亮登场

Linux内存管理:内存分配和内存回收原理

Linux CFS调度器:原理和实现

汇编启动!!!

Linux内核代码从哪里执行的?从链接脚本看

# arch/arm64/kernel/vmlinux.lds.SSECTIONS
{. = KIMAGE_VADDR + TEXT_OFFSET;.head.text : {_text = .;HEAD_TEXT}
// include/linux/init.h#define __HEAD		.section	".head.text","ax"
// arch/arm64/kernel/head.S__HEAD
_head:/** DO NOT MODIFY. Image header expected by Linux boot-loaders.*/b	stext				// branch to kernel start, magic

KIMAGE_VADDR是内核镜像的虚拟基地址(如下定义,等于MODULES_END,即模块区域之后、vmalloc区域的起始处),TEXT_OFFSET是内核起始地址距离RAM起始地址的偏移(每一版的Linux内核内存架构都有点不同,此处不做纠结)

// arch/arm64/include/asm/memory.h#define VA_BITS			(39)
#define VA_START		(UL(0xffffffffffffffff) - \(UL(1) << VA_BITS) + 1)
#define PAGE_OFFSET		(UL(0xffffffffffffffff) - \(UL(1) << (VA_BITS - 1)) + 1)
#define KIMAGE_VADDR		(MODULES_END)
#define BPF_JIT_REGION_START	(VA_START + KASAN_SHADOW_SIZE)
#define BPF_JIT_REGION_SIZE	(SZ_128M)
#define BPF_JIT_REGION_END	(BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
#define MODULES_END		(MODULES_VADDR + MODULES_VSIZE)
#define MODULES_VADDR		(BPF_JIT_REGION_END)
#define MODULES_VSIZE		(SZ_128M)
#define VMEMMAP_START		(PAGE_OFFSET - VMEMMAP_SIZE)
#define PCI_IO_END		(VMEMMAP_START - SZ_2M)
#define PCI_IO_START		(PCI_IO_END - PCI_IO_SIZE)
#define FIXADDR_TOP		(PCI_IO_START - SZ_2M)

在这里插入图片描述
这个Linear Mapping(线性映射)区域的位置并不固定,在有的内核版本中它位于VA_START处

上述跳转到stext符号,从这里才正式开始

ENTRY(stext)bl	preserve_boot_argsbl	el2_setup			// Drop to EL1, w0=cpu_boot_modeadrp	x23, __PHYS_OFFSETand	x23, x23, MIN_KIMG_ALIGN - 1	// KASLR offset, defaults to 0bl	set_cpu_boot_mode_flagbl	/** The following calls CPU setup code, see arch/arm64/mm/proc.S for* details.* On return, the CPU will be ready for the MMU to be turned on and* the TCR will have been set.*/bl	__cpu_setup			// initialise processorb	__primary_switch
ENDPROC(stext)

首先是preserve_boot_args符号

x0存储的是bootloader传入的FDT设备树文件的物理地址,将其保存到x21

/** Preserve the arguments passed by the bootloader in x0 .. x3*/
preserve_boot_args:mov	x21, x0				// x21=FDTadr_l	x0, boot_args			// record the contents ofstp	x21, x1, [x0]			// x0 .. x3 at kernel entrystp	x2, x3, [x0, #16]dmb	sy				// needed before dc ivac with// MMU offmov	x1, #0x20			// 4 x 8 bytesb	__inval_dcache_area		// tail call
ENDPROC(preserve_boot_args)

x21 x1 x2 x3四个寄存器的值存入boot_args数组

// arch/arm64/kernel/setup.c/** The recorded values of x0 .. x3 upon kernel entry.*/
u64 __cacheline_aligned boot_args[4];

__inval_dcache_area用于使32字节的数据缓存失效:x0为boot_args的地址,x1为32(字节),即数组的四个元素(这部分功能代码,在末尾进行解析)

接下来是el2_setup

ENTRY(el2_setup)msr	SPsel, #1			// We want to use SP_EL{1,2}mrs	x0, CurrentELcmp	x0, #CurrentEL_EL2b.eq	1fmov_q	x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1)msr	sctlr_el1, x0mov	w0, #BOOT_CPU_MODE_EL1		// This cpu booted in EL1isbret

SPsel用于在SP_EL0和SP_ELn之间选择SP寄存器,此处写入1,即选择使用当前异常级对应的SP寄存器(SP_EL1或SP_EL2)

CurrentEL用于获取当前异常级别,并与CurrentEL_EL2进行比较。我们假设此处CPU不是在EL2(虚拟化级别)启动,而是直接在EL1(内核级别)启动,因此不会跳转到1标签

sctlr_el1为系统控制寄存器,此处用于设置小端模式(如下,默认是小端)

// arch/arm64/include/asm/sysreg.h#define SCTLR_EL1_RES1	((_BITUL(11)) | (_BITUL(20)) | (_BITUL(22)) | (_BITUL(28)) | \(_BITUL(29)))#ifdef CONFIG_CPU_BIG_ENDIAN
#define ENDIAN_SET_EL1		(SCTLR_EL1_E0E | SCTLR_ELx_EE)
#define ENDIAN_CLEAR_EL1	0
#else
#define ENDIAN_SET_EL1		0
#define ENDIAN_CLEAR_EL1	(SCTLR_EL1_E0E | SCTLR_ELx_EE)
#endif

BOOT_CPU_MODE_EL1表示“该CPU是在EL1启动的”,它通过w0寄存器作为el2_setup的返回值带回stext,随后由set_cpu_boot_mode_flag写入__boot_cpu_mode(见下文)

// arch/arm64/include/asm/virt.h#define BOOT_CPU_MODE_EL1	(0xe11)

kaslr假设不开启,此处略过

直接看set_cpu_boot_mode_flag

/** Sets the __boot_cpu_mode flag depending on the CPU boot mode passed* in w0. See arch/arm64/include/asm/virt.h for more info.*/
set_cpu_boot_mode_flag:adr_l	x1, __boot_cpu_modecmp	w0, #BOOT_CPU_MODE_EL2b.ne	1fadd	x1, x1, #4
1:	str	w0, [x1]			// This CPU has booted in EL1dmb	sydc	ivac, x1			// Invalidate potentially stale cache lineret
ENDPROC(set_cpu_boot_mode_flag)

__boot_cpu_mode是一个整数数组

ENTRY(__boot_cpu_mode).long	BOOT_CPU_MODE_EL2.long	BOOT_CPU_MODE_EL1
// arch/arm64/include/asm/virt.hextern u32 __boot_cpu_mode[2];

前面的w0存储的是BOOT_CPU_MODE_EL1,由此处可知:This CPU has booted in EL1,跳转到1标签

此处让__boot_cpu_mode[0]等于BOOT_CPU_MODE_EL1,并通过dc ivac使该地址对应的缓存行失效

然后是__cpu_setup,先是清理tlb缓存

	.pushsection ".idmap.text", "awx"
ENTRY(__cpu_setup)tlbi	vmalle1				// Invalidate local TLBdsb	nsh

cpacr_el1用于控制对浮点和SIMD的访问:其FPEN字段(bit[21:20])决定在EL0或EL1执行访问浮点/SIMD相关寄存器的指令时,是否将其捕获(trap)到EL1。此处写入3 << 20(FPEN=0b11),表示不捕获,即允许EL0和EL1直接使用FP/ASIMD

mdscr_el1(Monitor Debug System Control Register)debug功能不做概述

	mov	x0, #3 << 20msr	cpacr_el1, x0			// Enable FP/ASIMDmov	x0, #1 << 12			// Reset mdscr_el1 and disablemsr	mdscr_el1, x0			// access to the DCC from EL0isb					// Unmask debug exceptions now,enable_dbg				// since this is per-cpureset_pmuserenr_el0 x0			// Disable PMU access from EL0

mair_el1用于控制存储器属性的编码:分为八段,用于描述不同的内存属性,后续会在页表中使用AttrIndx[2:0]进行索引

ARMv8最多可以定义八种不同的内存属性,而Linux内核只定义了六种

	ldr	x5, =MAIR(0x00, MT_DEVICE_nGnRnE) | \MAIR(0x04, MT_DEVICE_nGnRE) | \MAIR(0x0c, MT_DEVICE_GRE) | \MAIR(0x44, MT_NORMAL_NC) | \MAIR(0xff, MT_NORMAL) | \MAIR(0xbb, MT_NORMAL_WT)msr	mair_el1, x5

TCR寄存器主要包括了与地址转换相关的控制信息以及与高速缓存相关的配置信息

	/** Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for* both user and kernel.*/ldr	x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \TCR_TBI0 | TCR_A1 | TCR_KASAN_FLAGSldr_l		x9, idmap_t0sztcr_set_t0sz	x10, x9/** Set the IPS bits in TCR_EL1.*/tcr_compute_pa_size x10, #TCR_IPS_SHIFT, x5, x6msr	tcr_el1, x10ret					// return to head.S
ENDPROC(__cpu_setup)

这部分是初始化内存部分

最重要的是__primary_switch

__primary_switch:adrp	x1, init_pg_dirbl	__enable_mmuldr	x8, =__primary_switchedadrp	x0, __PHYS_OFFSETbr	x8
ENDPROC(__primary_switch)

__enable_mmu这个看名词即知,用于开启mmu

__create_page_tables用于创建两个页表:init_pg_dir和idmap_pg_dir

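下面这段代码先关闭MMU,再调用__create_page_tables。注意它的注释是“recreate kernel mapping”(重建内核映射):在Linux 5.0中这段代码位于__primary_switch的CONFIG_RANDOMIZE_BASE(KASLR)分支内,用于按KASLR偏移重建映射;首次创建页表则是由stext在MMU尚未开启时调用__create_page_tables完成的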

	msr	sctlr_el1, x20			// disable the MMUisbbl	__create_page_tables		// recreate kernel mapping

使init_pg_dir页表区域对应的数据缓存失效

__create_page_tables:mov	x28, lr/** Invalidate the init page tables to avoid potential dirty cache lines* being evicted. Other page tables are allocated in rodata as part of* the kernel image, and thus are clean to the PoC per the boot* protocol.*/adrp	x0, init_pg_dir // adrp获取的是物理地址adrp	x1, init_pg_endsub	x1, x1, x0bl	__inval_dcache_area // 清除缓存

将init_pg_dir页表内存清零(写入xzr)

	adrp	x0, init_pg_diradrp	x1, init_pg_endsub	x1, x1, x0 // x1为init_pg_dir占用的字节数
1:	stp	xzr, xzr, [x0], #16stp	xzr, xzr, [x0], #16stp	xzr, xzr, [x0], #16stp	xzr, xzr, [x0], #16subs	x1, x1, #64 // 一次清理64Bb.ne	1b

vabits_user用于保存虚拟地址位数

	/** Create the identity mapping. 恒等映射*/adrp	x0, idmap_pg_dir // idmap_pg_dir的物理地址adrp	x3, __idmap_text_start		// __pa(__idmap_text_start)mov	x5, #VA_BITS
1:adr_l	x6, vabits_userstr	x5, [x6]dmb	sydc	ivac, x6		// Invalidate potentially stale cache line

idmap_ptrs_per_pgd用于保存恒等映射页表(idmap_pg_dir)的PGD表项数

PGDIR_SHIFT是PGD索引在虚拟地址中的偏移位数

	/** If VA_BITS == 48, we don't have to configure an additional* translation level, but the top-level table has more entries.*/mov	x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)str_l	x4, idmap_ptrs_per_pgd, x5

创建并更新idmap_pg_dir页表

	ldr_l	x4, idmap_ptrs_per_pgdmov	x5, x3				// __pa(__idmap_text_start)adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)// 创建各个页表map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14

这部分空间是什么?

# arch/arm64/kernel/vmlinux.lds.S#define IDMAP_TEXT					\. = ALIGN(SZ_4K);				\__idmap_text_start = .;				\*(.idmap.text)					\__idmap_text_end = .;

其中.idmap.text如下

pushsection    .idmap.text, "awx"//
.popsection

使用map_memory创建页表并映射,map_memory宏定义如下

/** Map memory for specified virtual address range. Each level of page table needed supports* multiple entries. If a level requires n entries the next page table level is assumed to be* formed from n pages.**	tbl:	location of page table*	rtbl:	address to be used for first level page table entry (typically tbl + PAGE_SIZE)*	vstart:	start address to map*	vend:	end address to map - we map [vstart, vend]*	flags:	flags to use to map last level entries*	phys:	physical address corresponding to vstart - physical memory is contiguous*	pgds:	the number of pgd entries** Temporaries:	istart, iend, tmp, count, sv - these need to be different registers* Preserves:	vstart, vend, flags* Corrupts:	tbl, rtbl, istart, iend, tmp, count, sv*/.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, svadd \rtbl, \tbl, #PAGE_SIZEmov \sv, \rtblmov \count, #0compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \countpopulate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmpmov \tbl, \svmov \sv, \rtbl
#if SWAPPER_PGTABLE_LEVELS > 2compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \countpopulate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmpmov \tbl, \sv
#endifcompute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \countbic \count, \phys, #SWAPPER_BLOCK_SIZE - 1populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp.endm

创建并更新init_pg_dir表项

	/** Map the kernel image (starting with PHYS_OFFSET).*/adrp	x0, init_pg_dirmov_q	x5, KIMAGE_VADDR + TEXT_OFFSET	// compile time __va(_text)add	x5, x5, x23			// add KASLR displacementmov	x4, PTRS_PER_PGDadrp	x6, _end			// runtime __pa(_end)adrp	x3, _text			// runtime __pa(_text)sub	x6, x6, x3			// _end - _textadd	x6, x6, x5			// runtime __va(_end)map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14

由上面可知,前面已经建立了恒等映射和内核映射,下面开启MMU,并执行到__primary_switched

	msr	sctlr_el1, x19			// re-enable the MMUisbic	iallu				// flush instructions fetcheddsb	nsh				// via old mappingisbldr	x8, =__primary_switchedadrp	x0, __PHYS_OFFSETbr	x8
ENDPROC(__primary_switch)
__primary_switched:adrp	x4, init_thread_unionadd	sp, x4, #THREAD_SIZEadr_l	x5, init_taskmsr	sp_el0, x5			// Save thread_info// 设置异常向量表adr_l	x8, vectors			// load VBAR_EL1 with virtualmsr	vbar_el1, x8			// vector table addressisbstp	xzr, x30, [sp, #-16]!mov	x29, spstr_l	x21, __fdt_pointer, x5		// Save FDT pointerldr_l	x4, kimage_vaddr		// Save the offset betweensub	x4, x4, x0			// the kernel virtual andstr_l	x4, kimage_voffset, x5		// physical mappings// Clear BSSadr_l	x0, __bss_startmov	x1, xzradr_l	x2, __bss_stopsub	x2, x2, x0bl	__pi_memsetdsb	ishst				// Make zero page visible to PTWadd	sp, sp, #16mov	x29, #0mov	x30, #0b	start_kernel
ENDPROC(__primary_switched)

init_thread_union是init任务内核栈的起始地址(如下所示,__start_init_task = init_thread_union = init_stack)。add sp, x4, #THREAD_SIZE将sp设置为该起始地址加上THREAD_SIZE,即栈顶;随后把init_task进程描述符的地址写入sp_el0寄存器

// include/asm-generic/vmlinux.lds.h#define INIT_TASK_DATA(align)						\. = ALIGN(align);						\__start_init_task = .;						\init_thread_union = .;						\init_stack = .;							\KEEP(*(.data..init_task))					\KEEP(*(.data..init_thread_info))				\. = __start_init_task + THREAD_SIZE;				\__end_init_task = .;
// init/init_task.c/** Set up the first task table, touch at your own risk!. Base=0,* limit=0x1fffff (=2MB)*/
struct task_struct init_task
#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK__init_task_data
#endif
= {// 
};
EXPORT_SYMBOL(init_task);

在这里插入图片描述
设置异常向量表(vectors在后续会进行剖析)

	adr_l	x8, vectors			// load VBAR_EL1 with virtualmsr	vbar_el1, x8			// vector table addressisb

FDT物理地址(刚开始的时候将其地址存入x21)存入__fdt_pointer

	str_l	x21, __fdt_pointer, x5		// Save FDT pointer

计算并保存kimage_voffset:kimage_vaddr是内核镜像的虚拟地址,x0是内核被加载的物理地址(__PHYS_OFFSET),两者相减得到内核虚拟地址与物理地址之间的偏移,存入kimage_voffset

	ldr_l	x4, kimage_vaddr		// Save the offset betweensub	x4, x4, x0			// the kernel virtual andstr_l	x4, kimage_voffset, x5		// physical mappings

最后是清零bss段,为执行内核C函数做准备

跳转到start_kernel执行

细节剖析

map_memory宏定义

由上面idmap_pg_dir页表创建可知,

| 寄存器 | 地址 |
| --- | --- |
| x0 | idmap_pg_dir |
| x3 | __idmap_text_start |
| x6 | __idmap_text_end |
| x7 | SWAPPER_MM_MMUFLAGS |
| x4 | idmap_ptrs_per_pgd |
	map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14

可知,PGD页表占用一个页(PAGE_SIZE),rtbl(= tbl + PAGE_SIZE)用于存储下一级页表的基址

	.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, svadd \rtbl, \tbl, #PAGE_SIZEmov \sv, \rtblmov \count, #0

compute_indices宏的功能:根据虚拟地址计算各级页表的索引值

	compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count

populate_entries宏的功能:填充索引值index对应的页表项

此处用于设置PGD PUD PMD页表项,此处不会设置PTE,使用段映射,一般是2MB

// arch/arm64/include/asm/pgtable-hwdef.h/* Initial memory map size */
#if ARM64_SWAPPER_USES_SECTION_MAPS
#define SWAPPER_BLOCK_SHIFT	SECTION_SHIFT
#define SWAPPER_BLOCK_SIZE	SECTION_SIZE
#define SWAPPER_TABLE_SHIFT	PUD_SHIFT
#else
#define SWAPPER_BLOCK_SHIFT	PAGE_SHIFT
#define SWAPPER_BLOCK_SIZE	PAGE_SIZE
#define SWAPPER_TABLE_SHIFT	PMD_SHIFT
#endif
	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp

是怎么切换页表的?每一次切换一级页表

	mov \tbl, \svmov \sv, \rtbl

来看看具体的宏定义

	.macro compute_indices, vstart, vend, shift, ptrs, istart, iend, countlsr	\iend, \vend, \shift // 计算结束PGD表项mov	\istart, \ptrssub	\istart, \istart, #1 // 获得页表最大索引and	\iend, \iend, \istart	// iend = (vend >> shift) & (ptrs - 1) 将iend限制在最大范围内mov	\istart, \ptrsmul	\istart, \istart, \countadd	\iend, \iend, \istart	// iend += (count - 1) * ptrs// our entries span multiple tableslsr	\istart, \vstart, \shiftmov	\count, \ptrssub	\count, \count, #1and	\istart, \istart, \countsub	\count, \iend, \istart.endm

populate_entries下次补上

__inval_dcache_area函数定义

dc指令用于控制数据缓存

  • civac:清理并使指定虚拟地址对应的高速缓存行失效(作用到PoC)
  • ivac:使指定虚拟地址对应的高速缓存行失效(作用到PoC)
ENTRY(__inval_dcache_area)/* FALLTHROUGH *//**	__dma_inv_area(start, size)*	- start   - virtual start address of region x0*	- size    - size in question x1*/
__dma_inv_area:add	x1, x1, x0 // x1=x0(start)+x1(size) 结束地址enddcache_line_size x2, x3 // x2为缓存行大小sub	x3, x2, #1tst	x1, x3				// end cache line aligned? 缓存行是否对齐bic	x1, x1, x3 // 不对齐则清除为0,这会提前地址,而不会跳过规定的起始范围b.eq	1fdc	civac, x1			// clean & invalidate D / U line
1:	tst	x0, x3				// start cache line aligned?bic	x0, x0, x3b.eq	2fdc	civac, x0			// clean & invalidate D / U lineb	3f
2:	dc	ivac, x0			// invalidate D / U line
3:	add	x0, x0, x2cmp	x0, x1b.lo	2bdsb	syret
ENDPIPROC(__inval_dcache_area)

vectors异常向量表

ARM64中的异常向量表占用2048B,分为四组,每组四个表项,每个表项占用128B,四组分别是:

  • EL1t:异常发生在EL1,使用的栈指针不是当前异常级的SP_ELx(即使用SP_EL0)
  • EL1h:异常发生在EL1,使用当前异常级的栈指针SP_ELx(即SP_EL1)
  • 从低异常级EL0进入当前异常级EL1(Lower EL, AArch64)
  • 从低异常级EL0进入当前异常级EL1(Lower EL, AArch32)
/** Exception vectors.*/.pushsection ".entry.text", "ax".align	11
ENTRY(vectors)kernel_ventry	1, sync_invalid			// Synchronous EL1tkernel_ventry	1, irq_invalid			// IRQ EL1tkernel_ventry	1, fiq_invalid			// FIQ EL1tkernel_ventry	1, error_invalid		// Error EL1tkernel_ventry	1, sync				// Synchronous EL1hkernel_ventry	1, irq				// IRQ EL1hkernel_ventry	1, fiq_invalid			// FIQ EL1hkernel_ventry	1, error			// Error EL1hkernel_ventry	0, sync				// Synchronous 64-bit EL0kernel_ventry	0, irq				// IRQ 64-bit EL0kernel_ventry	0, fiq_invalid			// FIQ 64-bit EL0kernel_ventry	0, error			// Error 64-bit EL0kernel_ventry	0, sync_invalid, 32		// Synchronous 32-bit EL0kernel_ventry	0, irq_invalid, 32		// IRQ 32-bit EL0kernel_ventry	0, fiq_invalid, 32		// FIQ 32-bit EL0kernel_ventry	0, error_invalid, 32		// Error 32-bit EL0
END(vectors)

每四个异常分别对应于

#define BAD_SYNC	0
#define BAD_IRQ		1
#define BAD_FIQ		2
#define BAD_ERROR	3

版权声明:

本网仅为发布的内容提供存储空间,不对发表、转载的内容提供任何形式的保证。凡本网注明“来源:XXX网络”的作品,均转载自其它媒体,著作权归作者所有,商业转载请联系作者获得授权,非商业转载请注明出处。

我们尊重并感谢每一位作者,均已注明文章来源和作者。如因作品内容、版权或其它问题,请及时与我们联系,联系邮箱:809451989@qq.com,投稿邮箱:809451989@qq.com

热搜词