衆所周知,Linux內存管理的核心是夥伴系統(buddy system)。其實在linux啓動的那一刻,內存管理就已經開始了,只不過不是buddy在管理。在內核中,實現物理內存管理的allocator包括:
-
連續物理內存管理buddy allocator
-
非連續物理內存管理vmalloc allocator
-
小塊物理內存管理slab allocator
-
高端物理內存管理kmapper
-
初始化階段物理內存管理memblock
在系統初始化階段會先啓用一個bootmem分配器和memblock分配器,此分配器是專門用於啓動階段的,一個bootmem分配器管理着一個node結點的所有內存,也就是在numa架構中多個node有多個bootmem,他們被鏈入bdata_list鏈表中保存。而夥伴系統的初始化就是將bootmem管理的所有物理頁框釋放到夥伴系統中去,本章主要分析從bootmem過渡到buddy的整個流程是如何實現的。
1. 由mem_init開始
/*
 * mem_init() - hand boot-time memory over to the page allocator.
 *
 * In order: records max_mapnr, frees unused memmap entries, releases boot
 * memory and highmem pages, prints the virtual kernel memory layout, checks
 * the layout constants and, on very small machines with large pages, turns
 * overcommit on by default.
 */
void __init mem_init(void)
{
#ifdef CONFIG_HAVE_TCM
/* These pointers are filled in on TCM detection */
extern u32 dtcm_end;
extern u32 itcm_end;
#endif
/* max_mapnr is the offset of max_pfn's struct page from the start of mem_map */
set_max_mapnr(pfn_to_page(max_pfn) - mem_map); --------------(1)
/* this will put all unused low memory onto the freelists */
free_unused_memmap(); --------------(2)
free_all_bootmem(); --------------(3)
#ifdef CONFIG_SA1111
/* now that our DMA memory is actually so designated, we can free it */
free_reserved_area(__va(PHYS_OFFSET), swapper_pg_dir, -1, NULL);
#endif
free_highpages(); --------------(4)
mem_init_print_info(NULL); --------------(5)
/*
 * Each helper expands to three printk arguments: base, top and the size
 * (top - base) in kB (MLK), in MB (MLM), or in kB rounded up (MLK_ROUNDUP).
 */
#define MLK(b, t) b, t, ((t) - (b)) >> 10
#define MLM(b, t) b, t, ((t) - (b)) >> 20
#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)
/*
 * The argument list must follow the format lines one-for-one, under the
 * same #ifdef guards, because every format line consumes one macro triple.
 */
pr_notice("Virtual kernel memory layout:\n"
" vector : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HAVE_TCM
" DTCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
" ITCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
#ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
#ifdef CONFIG_MODULES
" modules : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
" .text : 0x%p" " - 0x%p" " (%4td kB)\n"
" .init : 0x%p" " - 0x%p" " (%4td kB)\n"
" .data : 0x%p" " - 0x%p" " (%4td kB)\n"
" .bss : 0x%p" " - 0x%p" " (%4td kB)\n",
MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
(PAGE_SIZE)),
#ifdef CONFIG_HAVE_TCM
MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
MLK(ITCM_OFFSET, (unsigned long) itcm_end),
#endif
MLK(FIXADDR_START, FIXADDR_END),
MLM(VMALLOC_START, VMALLOC_END),
MLM(PAGE_OFFSET, (unsigned long)high_memory),
#ifdef CONFIG_HIGHMEM
MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) *
(PAGE_SIZE)),
#endif
#ifdef CONFIG_MODULES
MLM(MODULES_VADDR, MODULES_END),
#endif
MLK_ROUNDUP(_text, _etext),
MLK_ROUNDUP(__init_begin, __init_end),
MLK_ROUNDUP(_sdata, _edata),
MLK_ROUNDUP(__bss_start, __bss_stop));
/* The helpers are only meaningful for the call above; drop them again */
#undef MLK
#undef MLM
#undef MLK_ROUNDUP
/*
 * Check boundaries twice: Some fundamental inconsistencies can
 * be detected at build time already.
 */
#ifdef CONFIG_MMU
BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
BUG_ON(TASK_SIZE > MODULES_VADDR);
#endif
#ifdef CONFIG_HIGHMEM
BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE > PAGE_OFFSET);
BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE > PAGE_OFFSET);
#endif
/* Pages of at least 16 KiB and no more than 128 physical pages in total */
if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
extern int sysctl_overcommit_memory;
/*
 * On a machine this small we won't get
 * anywhere without overcommit, so turn
 * it on by default.
 */
sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
}
}
-
函數set_max_mapnr()用於設置max_mapnr,其值爲max_pfn對應的page相對於mem_map起始處的偏移,也就是實際物理內存所包含的頁數
-
free_unused_memmap將物理上不存在的頁(hole)在頁管理位圖中全部記錄爲"不使用"。
-
在頁管理位圖中記錄爲"不使用"後,free_all_bootmem函數進行釋放,使其能夠在夥伴系統中管理空白頁。
-
free_highpages將高端內存區域釋放到夥伴系統,使其能管理空白頁
-
mem_init_print_info()是把內核映像的各個段地址打印出來,後面主要是將整個內核空間的虛擬映射空間打印出來,對於我們現在使用的開發板其打印信息如下
2. 空閒內存釋放
函數free_unused_memmap()和free_all_bootmem()都是把空閒內存釋放到夥伴系統,前者釋放memblock中空閒內存,後者釋放bootmem中內存。
/*
 * free_unused_memmap() - free the memmap entries that cover gaps between
 * memory banks.
 *
 * Walks the memblock memory regions and calls free_memmap() on the pfn
 * range between the aligned end of the previous bank and the aligned start
 * of the current one. With SPARSEMEM, the tail of the last bank up to the
 * next section boundary is handed to free_memmap() as well.
 */
static void __init free_unused_memmap(void)
{
	struct memblock_region *bank;
	unsigned long last_end = 0;	/* 0 means no bank has been seen yet */
	unsigned long bank_start;

	/*
	 * This relies on each bank being in address order.
	 * The banks are sorted previously in bootmem_init().
	 */
	for_each_memblock(memory, bank) {
		bank_start = memblock_region_memory_base_pfn(bank);

#ifdef CONFIG_SPARSEMEM
		/* Never reach past the section that holds the previous end */
		bank_start = min(bank_start,
				 ALIGN(last_end, PAGES_PER_SECTION));
#else
		/* Work on MAX_ORDER_NR_PAGES boundaries */
		bank_start = round_down(bank_start, MAX_ORDER_NR_PAGES);
#endif

		/* A gap only exists once a previous bank has been recorded */
		if (last_end != 0 && bank_start > last_end)
			free_memmap(last_end, bank_start);

		last_end = ALIGN(memblock_region_memory_end_pfn(bank),
				 MAX_ORDER_NR_PAGES);
	}

#ifdef CONFIG_SPARSEMEM
	if (!IS_ALIGNED(last_end, PAGES_PER_SECTION))
		free_memmap(last_end, ALIGN(last_end, PAGES_PER_SECTION));
#endif
}
該函數主要是遍歷memblock的memory區域。對於IMX開發板,其region的起始物理地址爲0x80000000,得到對應的start(頁框號)爲0x80000;此時prev_end仍爲0,所以不滿足調用free_memmap的條件,之後得到的prev_end爲0xa0000。由於該開發板只有一片內存,所以memblock中沒有相應的空閒內存需要釋放。系統在分配內存節點的mem_map時是按照這個內存節點起始地址到末尾地址分配的,這個地址空間中可能有空洞,這些空洞地址對應的page數據結構是可以釋放掉的,如下圖所示
下面我們來看看bootmem的釋放,首先我們來看看bootmem的struct bootmem_data結構:
/*
 * Per-node bookkeeping of the boot-time (bootmem) allocator.
 * Field notes follow the article's own description of this structure.
 */
typedef struct bootmem_data {
/* First page frame number of this memory block */
unsigned long node_min_pfn;
/* Last page frame number of this block (on 32-bit: last frame of ZONE_NORMAL) */
unsigned long node_low_pfn;
/* Bitmap with one bit per page frame below ZONE_HIGHMEM; holes are marked allocated */
void *node_bootmem_map;
/* NOTE(review): described as the offset of the previous allocation from the end — confirm */
unsigned long last_end_off;
/* NOTE(review): purpose not described in the accompanying text — confirm */
unsigned long hint_idx;
/* Links this descriptor into the bdata_list list */
struct list_head list;
} bootmem_data_t;
結構體成員 | 含義 |
---|---|
node_min_pfn | 此塊內存開始頁框號 |
node_low_pfn | 此塊內存結束頁框號,如果是32位系統下此保存的是 ZONE_NORMAL最後一個頁框號 |
node_bootmem_map | 指向位圖內存區,node中所有ZONE_HIGHMEM之前的頁框都在這裏面有一個位,每次需要分配內存時就會掃描找出一個空閒頁框,空洞的內存也會佔用位,不過空洞的內存應該設置爲已分配 |
last_end_off | 上次分配距離末尾的偏移量 |
hint_idx | |
list | 鏈入bdata_list結構鏈表 |
bootmem分配器的核心就是node_bootmem_map這個位圖,每一位代表這個node的一個頁,當需要分配時,就會去掃描這個位圖,然後獲得一段物理頁框進行分配,一般都會從開始處向後分配。而夥伴系統初始化時會根據這個位圖,將位圖中空閒的頁釋放回夥伴系統,而已經分配出去的頁則不會在初始化階段釋放回夥伴系統,不過有可能在運行過程中釋放回夥伴系統。對於支持memblock的內核,由於配置了CONFIG_NO_BOOTMEM,free_all_bootmem的實現位於mm/nobootmem.c,具體如下:
/*
 * free_all_bootmem() - release boot-time memory to the page allocator.
 *
 * Returns the page count reported by free_low_memory_core_early(); the same
 * count is added to totalram_pages.
 */
unsigned long __init free_all_bootmem(void)
{
unsigned long pages;
/* NOTE(review): presumably resets managed_pages of every zone; helper not shown here */
reset_all_zones_managed_pages(); ---------------(1)
pages = free_low_memory_core_early(); ---------------(2)
/* Account the freed pages in the global RAM total */
totalram_pages += pages;
return pages;
}
- 設置所有node的所有zone的managed_pages爲0,該函數只會啓動時候調用一次
- 遍歷所有需要釋放的啓動內存數據塊,釋放bdata啓動內存塊中所有頁框到頁框分配器中,計算所有的內存頁數據,存儲在totalram_pages中,並返回總共釋放的頁數量。
繼續看free_low_memory_core_early,其主要的實現如下所示
/*
 * free_low_memory_core_early() - pass the free memblock ranges on for
 * freeing.
 *
 * First runs reserve_bootmem_region() over every reserved memblock region,
 * then feeds each free range to __free_memory_core() and adds up what it
 * reports.
 *
 * Return: sum of the page counts returned by __free_memory_core().
 */
static unsigned long __init free_low_memory_core_early(void)
{
	phys_addr_t range_start, range_end;
	unsigned long freed_pages = 0;
	u64 idx;

	memblock_clear_hotplug(0, -1);

	for_each_reserved_mem_region(idx, &range_start, &range_end) {
		reserve_bootmem_region(range_start, range_end);
	}

	/*
	 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
	 * because in some case like Node0 doesn't have RAM installed
	 * low ram will be on Node1
	 */
	for_each_free_mem_range(idx, NUMA_NO_NODE, MEMBLOCK_NONE,
				&range_start, &range_end, NULL) {
		freed_pages += __free_memory_core(range_start, range_end);
	}

	return freed_pages;
}
-
遍歷memblock.reserved類型的regions,對每個regions設置頁面屬性爲Reserved,對於Imx,這個reserved區域爲
[root@qemu_imx6ul:~]# cat /sys/kernel/debug/memblock/reserved 0: 0x0000000080003000..0x0000000080007fff 1: 0x0000000080200000..0x00000000810e8eeb 2: 0x0000000088000000..0x0000000088014303 3: 0x000000008bad3000..0x000000008bb40fff 4: 0x000000008bb413c0..0x000000008bb433bf 5: 0x000000008bb433f4..0x000000008bffefff 6: 0x000000008bfff740..0x000000008bfff77b 7: 0x000000008bfff780..0x000000008bfff7bb 8: 0x000000008bfff7c0..0x000000008bfff837 9: 0x000000008bfff840..0x000000008bfff843 10: 0x000000008bfff880..0x000000008bfff883 11: 0x000000008bfff8c0..0x000000008bfff8c3 12: 0x000000008bfff900..0x000000008bfff903 13: 0x000000008bfff940..0x000000008bfff9a1 14: 0x000000008bfff9c0..0x000000008bfffa21 15: 0x000000008bfffa40..0x000000008bfffaa1 16: 0x000000008bfffaac..0x000000008bfffac6 17: 0x000000008bfffac8..0x000000008bfffae2 18: 0x000000008bfffae4..0x000000008bfffb5e 19: 0x000000008bfffb60..0x000000008bfffb7a 20: 0x000000008bfffb7c..0x000000008bfffb96 21: 0x000000008bfffb98..0x000000008bfffbb2 22: 0x000000008bfffbb4..0x000000008bfffbce 23: 0x000000008bfffbd0..0x000000008bfffbea 24: 0x000000008bfffbec..0x000000008bfffc06 25: 0x000000008bfffc08..0x000000008bfffc22 26: 0x000000008bfffc24..0x000000008bfffccc 27: 0x000000008bfffcd0..0x000000008bfffce8 28: 0x000000008bfffcec..0x000000008bfffd04 29: 0x000000008bfffd08..0x000000008bfffd20 30: 0x000000008bfffd24..0x000000008bfffd3c 31: 0x000000008bfffd40..0x000000008bfffd5c 32: 0x000000008bfffd60..0x000000008bfffd7c 33: 0x000000008bfffd80..0x000000008bfffdc7 34: 0x000000008bfffdd8..0x000000009fffffff
-
遍歷所有在memblock.memory中,但是不在memblock.reserved中的regions。然後清Reserved頁面屬性
下面重點看看頁面是如何完成reserved的配置,其代碼如下,主要是對reserved區域中每個有效的頁框,初始化其page->lru鏈表頭,並通過SetPageReserved將該頁的page->flags標記爲PG_reserved。
/*
 * reserve_bootmem_region() - mark the pages of a physical range as reserved.
 * @start: physical start address, converted to a pfn with PFN_DOWN()
 * @end:   physical end address, converted to a pfn with PFN_UP()
 *
 * For every valid pfn in the resulting range the page is initialised via
 * init_reserved_page(), its lru list head is reset and SetPageReserved()
 * is applied. Invalid pfns are skipped.
 */
void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
	unsigned long last_pfn = PFN_UP(end);
	unsigned long pfn;

	for (pfn = PFN_DOWN(start); pfn < last_pfn; pfn++) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		init_reserved_page(pfn);

		/* Avoid false-positive PageTail() */
		INIT_LIST_HEAD(&page->lru);

		SetPageReserved(page);
	}
}
我們看看重點的__free_memory_core。它針對每一個在memblock.memory中、但不在memblock.reserved中的region被調用,對於imx的開發板,這些region的信息如下
0: 0x0000000080000000 .. 0x0000000080003000
1: 0x0000000080008000 .. 0x0000000080200000
2: 0x00000000810e9000 .. 0x0000000088000000
3: 0x0000000088015000 .. 0x000000008bad3000
4: 0x000000008bb41000 .. 0x000000008bb41000
5: 0x000000008bfff000 .. 0x000000008bfff000
/*
 * __free_memory_core() - free the whole pages inside a physical range.
 * @start: physical start address; converted with PFN_UP()
 * @end:   physical end address; converted with PFN_DOWN() and capped at
 *         max_low_pfn
 *
 * Return: number of page frames passed to __free_pages_memory(), or 0 when
 * the converted start lies beyond the converted end.
 */
static unsigned long __init __free_memory_core(phys_addr_t start,
					       phys_addr_t end)
{
	unsigned long first_pfn = PFN_UP(start);
	unsigned long limit_pfn = min_t(unsigned long,
					PFN_DOWN(end), max_low_pfn);

	if (first_pfn <= limit_pfn) {
		__free_pages_memory(first_pfn, limit_pfn);
		return limit_pfn - first_pfn;
	}

	return 0;
}
核心的__free_pages_memory函數,該函數以階(order)爲單位釋放頁,清空各頁的PG_reserved位,設置page->count爲0後,然後調用__free_pages,代碼實現爲
/*
 * __free_pages() - drop a reference and free the pages when it was the last.
 * @page:  first page of the block
 * @order: order of the block; 0 selects free_hot_cold_page(), anything else
 *         goes to __free_pages_ok()
 *
 * Nothing is freed unless put_page_testzero() reports true.
 */
void __free_pages(struct page *page, unsigned int order)
{
	if (!put_page_testzero(page))
		return;

	if (order == 0)
		free_hot_cold_page(page, false);
	else
		__free_pages_ok(page, order);
}
-
首先通過put_page_testzero檢查頁的引用計數page->_refcount減1後的值是否爲0
-
引用計數減爲0後,若order爲0(即只釋放1頁)則調用free_hot_cold_page,否則調用__free_pages_ok,將頁以階(order)爲單位釋放
該過程比較複雜,涉及到夥伴系統的一些算法,先留一個疑問,後面深入分析下。
3. 高端內存釋放
/*
 * free_highpages() - hand the free part of high memory to free_area_high().
 *
 * Only does work when CONFIG_HIGHMEM is set. For every memblock memory
 * region that extends above max_low_pfn (and is not marked nomap), the
 * reserved regions are walked and each stretch between them is passed to
 * free_area_high(), followed by whatever remains after the last one.
 */
static void __init free_highpages(void)
{
#ifdef CONFIG_HIGHMEM
	unsigned long max_low = max_low_pfn;
	struct memblock_region *mem, *res;

	/* set highmem page free */
	for_each_memblock(memory, mem) {
		unsigned long start = memblock_region_memory_base_pfn(mem);
		unsigned long end = memblock_region_memory_end_pfn(mem);

		/* Ignore complete lowmem entries */
		if (end <= max_low)
			continue;

		if (memblock_is_nomap(mem))
			continue;

		/* Truncate partial highmem entries */
		if (start < max_low)
			start = max_low;

		/* Find and exclude any reserved regions */
		for_each_memblock(reserved, res) {
			unsigned long res_start, res_end;

			res_start = memblock_region_reserved_base_pfn(res);
			res_end = memblock_region_reserved_end_pfn(res);

			/* Reserved region lies below the current position */
			if (res_end < start)
				continue;

			/* Clamp the reserved region to [start, end] */
			if (res_start < start)
				res_start = start;
			if (res_start > end)
				res_start = end;
			if (res_end > end)
				res_end = end;

			/* Free the stretch in front of the reserved region */
			if (res_start != start)
				free_area_high(start, res_start);

			/* Continue after the reserved region */
			start = res_end;
			if (start == end)
				break;
		}

		/* And now free anything which remains */
		if (start < end)
			free_area_high(start, end);
	}
#endif
}
存在高端內存時,該代碼求出高端內存的起始頁幀和尾頁幀,然後調用free_area_high函數使夥伴系統管理空白頁,free_area_high函數在內部調用__free_page函數,將空白頁和一般內存區域共同釋放到夥伴系統。
4. 總結
本章的mem_init()函數結束啓動時的內存分配器memblock和bootmem,將bootmem和memblock管理的空白頁以階(order)爲單位構建列表,構建好的夥伴系統將爲Linux的內存分配器slab提供空白頁。