5.3 设置每CPU环境
回到start_kernel,563行调用mm_init_owner函数,将init_mm的owner字段指回init_task。这个函数可以说是进入start_kernel以来最简单的函数之一了。继续走,setup_command_line也很简单:
/*
 * Save two copies of the kernel command line: saved_command_line keeps
 * the untouched boot_command_line, while static_command_line keeps the
 * command_line already processed by setup_arch().  Both buffers are
 * carved out of the freshly initialised bootmem allocator.
 */
static void __init setup_command_line(char *command_line)
{
	/* +1 for the trailing NUL terminator of each string */
	saved_command_line = alloc_bootmem(strlen(boot_command_line) + 1);
	static_command_line = alloc_bootmem(strlen(command_line) + 1);
	strcpy(saved_command_line, boot_command_line);
	strcpy(static_command_line, command_line);
}
把刚才在setup_arch()中拷贝进来的command_line,拷贝到全局变量saved_command_line和static_command_line所指向的内存单元中。这个内存单元通过alloc_bootmem函数在刚刚建立好的内存管理环境中进行分配。
继续走,565行,setup_nr_cpu_ids()函数,在多CPU情况下,调用同一文件中的:
/*
 * Derive nr_cpu_ids from cpu_possible_mask: the id of the highest
 * possible CPU, plus one.
 */
static void __init setup_nr_cpu_ids(void)
{
	unsigned long highest_cpu;

	highest_cpu = find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS);
	nr_cpu_ids = highest_cpu + 1;
}
nr_cpu_ids是一个特殊的值,在单CPU情况下是1;而SMP情况下,又是一个全局变量,被find_last_bit函数设置,针对x86体系其本质上会调用bsr汇编指令。这里我大概介绍一下这个指令的概念。386以上的CPU有一对指令BSF/BSR——正/反向位扫描。这个指令的使用方法是:BSF dest,src,影响标志位ZF。这个指令的意思是,在源操作数中扫描被置位的位:如果源操作数非零(即找到了某个被置位的位),则清除ZF,并将第一个被置位的位的索引装载到目的操作数中;如果源操作数为零(没有任何位被置位),则设置ZF,此时目的操作数的内容未定义。BSF正向扫描各个位(从第0位到第N位),BSR相反(从第N位到第0位)。
继续走,566行,setup_per_cpu_areas,来自arch/x86/kernel/setup_percpu.c。这个函数只是设置一下SMP的每CPU存储区,也就是说为系统中的每个cpu的per_cpu变量申请空间。函数比较复杂,我这里只把整个函数列出来,对SMP感兴趣的同学可以尝试深入分析一下:
/*
 * Allocate and initialise the per-CPU areas for every possible CPU,
 * copy the early static per-CPU data into them, and point each CPU's
 * percpu offset at its own area.
 *
 * Fixes vs. the quoted text: the two format strings ended in "/n"
 * (garbled "\n"), and the page-allocator callback was misspelled
 * "pcpup_populate_pte" (the real symbol is pcpu_populate_pte).
 */
void __init setup_per_cpu_areas(void)
{
	unsigned int cpu;
	unsigned long delta;
	int rc;

	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	/*
	 * Allocate percpu area.  Embedding allocator is our favorite;
	 * however, on NUMA configurations, it can result in very
	 * sparse unit mapping and vmalloc area isn't spacious enough
	 * on 32bit.  Use page in that case.
	 */
#ifdef CONFIG_X86_32
	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	rc = -EINVAL;
	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		/* With PSE we can map the first chunk with large pages */
		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
		const size_t dyn_size = PERCPU_MODULE_RESERVE +
			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;

		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					    dyn_size, atom_size,
					    pcpu_cpu_distance,
					    pcpu_fc_alloc, pcpu_fc_free);
		if (rc < 0)
			pr_warning("%s allocator failed (%d), falling back to page size\n",
				   pcpu_fc_names[pcpu_chosen_fc], rc);
	}
	if (rc < 0)
		rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					   pcpu_fc_alloc, pcpu_fc_free,
					   pcpu_populate_pte);
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);

	/* alrighty, percpu areas up and running */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
		per_cpu(cpu_number, cpu) = cpu;
		setup_percpu_segment(cpu);
		setup_stack_canary_segment(cpu);
		/*
		 * Copy data used in early init routines from the
		 * initial arrays to the per cpu data areas.  These
		 * arrays then become expendable and the *_early_ptr's
		 * are zeroed indicating that the static arrays are
		 * gone.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		per_cpu(x86_cpu_to_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
		/* 64 bytes of slack below the top of the IRQ stack */
		per_cpu(irq_stack_ptr, cpu) =
			per_cpu(irq_stack_union.irq_stack, cpu) +
			IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
#endif
#endif
		/*
		 * Up to this point, the boot CPU has been using .data.init
		 * area.  Reload any changed state for the boot CPU.
		 */
		if (cpu == boot_cpu_id)
			switch_to_new_gdt(cpu);
	}

	/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
	/*
	 * make sure boot cpu node_number is right, when boot cpu is on the
	 * node that doesn't have mem installed
	 */
	per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
#endif

	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

	/* Setup cpu initialized, callin, callout masks */
	setup_cpu_local_masks();
}
在该函数中,为每个CPU分配一段专有数据区,并将.data.percpu中的数据拷贝到其中,每个CPU各有一份。由于数据从__per_cpu_start处转移到各CPU自己的专有数据区中了,因此存取其中的变量就不能再用原先的地址了:比如存取per_cpu__runqueues,就不能再直接使用per_cpu__runqueues这个符号本身的地址,而需要做一个偏移量的调整,即加上本CPU专有数据区首地址相对于__per_cpu_start的偏移量,也就是__per_cpu_offset[i](CPU i的专有数据区相对于__per_cpu_start的偏移量为__per_cpu_offset[i])。
这样,就可以方便地计算专有数据区中各变量的新地址,比如对于per_cpu_runqueues, 其新地址即变成per_cpu_runqueues+__per_cpu_offset[i]。
经过这样的处理,.data.percpu这个section在系统初始化后就可以释放了。为什么要释放它?OK,自己去看arch/x86/kernel/vmlinux.lds文件,整个.data.percpu这个section都在__init_begin和__init_end之间,也就是说,该section所占内存会在系统启动后释放(free)掉。
继续走,start_kernel的567行smp_prepare_boot_cpu函数,来自arch/x86/include/asm/smp.h:
/*
 * Thin dispatch wrapper: boot-CPU preparation goes through the
 * platform's smp_ops operation table (native_smp_prepare_boot_cpu
 * on bare-metal x86; paravirt platforms install their own hook).
 */
static inline void smp_prepare_boot_cpu(void)
{
/* indirect call through the per-platform SMP operations table */
smp_ops.smp_prepare_boot_cpu();
}
全局变量smp_ops也是一个smp_ops结构,在代码arch/x86/kernel/smp.c中被初始化成:
struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done,
.smp_send_stop = native_smp_send_stop, .smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up, .cpu_die = native_cpu_die, .cpu_disable = native_cpu_disable, .play_dead = native_play_dead,
.send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; |
所以,567行smp_prepare_boot_cpu函数最终调用native_smp_prepare_boot_cpu函数。该函数最终会调用switch_to_new_gdt函数,传给它的参数是当前CPU的编号:
/*
 * Load the per-CPU GDT for @cpu and then reload the per-CPU segment
 * base, so that subsequent percpu variable accesses resolve to this
 * CPU's own per-CPU area.
 */
void switch_to_new_gdt(int cpu)
{
	struct desc_ptr gdt_descr;

	gdt_descr.address = (long)get_cpu_gdt_table(cpu);
	gdt_descr.size = GDT_SIZE - 1;	/* limit = size in bytes - 1 */
	load_gdt(&gdt_descr);
	/* Reload the per-cpu base */
	load_percpu_segment(cpu);
}