内核内存稳定性新特性：Page Table Check 机制解读

Linux内核中，内存损坏一直是极难定位但又较为常见的一类问题。内核中已经有不少机制用来拦截此类问题，比如KASAN、KFENCE等。而内核自5.17版本起又引入了Page Table Check机制，用来检测某些因page引用计数异常而导致的内存损坏问题。

一、为何引入Page Table Check机制：

Google 的工程师在分析一个进程的dump时，无意间发现了一页不属于该进程的内存。进一步研究发现了内核自4.14起就存在的内存page引用计数的bug。为化解此类内存缺陷，Google 提出了一个全新的“页表检查”（Page Table Check）解决方案。

我们看看 Google 的修复patch及问题发生的原因：

---
 kernel/events/core.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 236e7900e3fc..0736508d595b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6110,7 +6110,6 @@ void perf_output_sample(struct perf_output_handle *handle,
 static u64 perf_virt_to_phys(u64 virt)
 {
         u64 phys_addr = 0;
-        struct page *p = NULL;
 
         if (!virt)
                 return 0;
@@ -6129,14 +6128,15 @@ static u64 perf_virt_to_phys(u64 virt)
                  * If failed, leave phys_addr as 0.
                  */
                 if (current->mm != NULL) {
+                        struct page *p;
+
                         pagefault_disable();
-                        if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
+                        if (__get_user_pages_fast(virt, 1, 0, &p) == 1) {
                                 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+                                put_page(p);
+                        }
                         pagefault_enable();
                 }
-
-                if (p)
-                        put_page(p);
         }
 
         return phys_addr;
--

问题发生的根因在于：__get_user_pages_fast函数可能先给page指针p赋了值，随后又因为某个错误而返回失败。在这种场景下，__get_user_pages_fast返回时并没有通过get_page持有该page的引用计数，因此后续的put_page是多余的，会导致引用计数下溢。修复方式其实比较简单：只有在__get_user_pages_fast成功时，才调用put_page。

此问题十分隐蔽，在内核中存在了很长时间。正因为如此，Google才推出了Page Table Check机制，希望能在第一时间拦截此类问题。

二、Page Table Check 机制的实现：

新增一个page_ext记录当前page的映射是匿名或者文件映射。在每次映射关系改变时，会判断当前的映射标记，如果出现不允许的情况就会主动panic，保留第一现场。

具体规则如下：

当前映射	新映射	映射权限	规则
匿名	匿名	读	允许
匿名	匿名	读/写	禁止
匿名	文件	任何	禁止
文件	匿名	任何	禁止
文件	文件	任何	允许

我们来看看这个规则的代码实现。在有新的映射发生时，会根据page现有的file_map_count/anon_map_count计数来判断映射是否合法，并且会更新相应的计数值。

/*
 * Record a new mapping of pgcnt pages starting at pfn in the per-page
 * counters, and BUG() if the new mapping is inconsistent with what those
 * counters already hold.
 *
 * mm, addr: not referenced anywhere in this body.
 * pfn:      first page frame of the range; the function returns without
 *           doing anything when pfn_valid(pfn) is false.
 * pgcnt:    number of consecutive page_ext entries that are updated.
 * rw:       presumably true when the new mapping is writable; it only
 *           matters for anonymous pages (see the anon branch below).
 */
static void page_table_check_set(struct mm_struct *mm, unsigned long addr,                                 unsigned long pfn, unsigned long pgcnt,
                                 bool rw)
{
        struct page_ext *page_ext;
        struct page *page;
        unsigned long i;
        bool anon;

        if (!pfn_valid(pfn))
                return;

        page = pfn_to_page(pfn);
        page_ext = page_ext_get(page);
        /* Sampled once from the first page and applied to all pgcnt entries. */
        anon = PageAnon(page);

        for (i = 0; i < pgcnt; i++) {
                struct page_table_check *ptc = get_page_table_check(page_ext);

                if (anon) {
                        /* An anonymous page must not have any file mapping recorded. */
                        BUG_ON(atomic_read(&ptc->file_map_count));
                        /*
                         * The increment always happens (it is a side effect of the
                         * BUG_ON argument); a resulting count above 1 is fatal only
                         * when rw is set.
                         */
                        BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw);
                } else {
                        /* A non-anonymous page must not have any anonymous mapping recorded. */
                        BUG_ON(atomic_read(&ptc->anon_map_count));
                        /*
                         * Any positive number of file mappings is accepted; only a
                         * negative result of the increment is fatal.
                         */
                        BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
                }
                page_ext = page_ext_next(page_ext);
        }
        /*
         * NOTE(review): page_ext has been advanced by the loop at this point;
         * presumably page_ext_put() only pairs with page_ext_get() and does not
         * depend on the exact pointer value - confirm against mm/page_ext.c.
         */
        page_ext_put(page_ext);
}

代码的调用逻辑如下：

缺页中断
  → handle_mm_fault
    → handle_pte_fault
      → do_anonymous_page / do_fault
        → set_pte_at
          → page_table_check_set
            ↳ 判断是否合法

同样在unmap时，也会调用page_table_check_clear来判断当前的标志位。

/*
 * Remove a mapping of pgcnt pages starting at pfn from the per-page
 * counters, and BUG() if the counters show that no such mapping was
 * recorded.
 *
 * mm, addr: not referenced anywhere in this body.
 * pfn:      first page frame of the range; the function returns without
 *           doing anything when pfn_valid(pfn) is false.
 * pgcnt:    number of consecutive page_ext entries that are updated.
 */
static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,                                   unsigned long pfn, unsigned long pgcnt)
{
        struct page_ext *page_ext;
        struct page *page;
        unsigned long i;
        bool anon;

        if (!pfn_valid(pfn))
                return;

        page = pfn_to_page(pfn);
        page_ext = page_ext_get(page);
        /* Sampled once from the first page and applied to all pgcnt entries. */
        anon = PageAnon(page);

        for (i = 0; i < pgcnt; i++) {
                struct page_table_check *ptc = get_page_table_check(page_ext);

                if (anon) {
                        /* An anonymous page must not have any file mapping recorded. */
                        BUG_ON(atomic_read(&ptc->file_map_count));
                        /* Decrement always happens; going below zero is fatal. */
                        BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
                } else {
                        /* A non-anonymous page must not have any anonymous mapping recorded. */
                        BUG_ON(atomic_read(&ptc->anon_map_count));
                        /* Decrement always happens; going below zero is fatal. */
                        BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
                }
                page_ext = page_ext_next(page_ext);
        }
        /*
         * NOTE(review): page_ext has been advanced by the loop at this point;
         * presumably page_ext_put() only pairs with page_ext_get() and does not
         * depend on the exact pointer value - confirm against mm/page_ext.c.
         */
        page_ext_put(page_ext);
}

而在分配 alloc_pages() 和释放 free_pages_prepare() 内存的时候也会调用__page_table_check_zero，保证当前内存没有被映射。

/*
 * BUG() unless every one of the (1 << order) pages starting at page has
 * both its anon_map_count and its file_map_count equal to zero.
 * The counters are only read here, never modified.
 */
void __page_table_check_zero(struct page *page, unsigned int order){
        struct page_ext *page_ext;
        unsigned long i;

        page_ext = page_ext_get(page);
        /* A missing page_ext is treated as fatal rather than silently skipped. */
        BUG_ON(!page_ext);
        for (i = 0; i < (1ul << order); i++) {
                struct page_table_check *ptc = get_page_table_check(page_ext);

                BUG_ON(atomic_read(&ptc->anon_map_count));
                BUG_ON(atomic_read(&ptc->file_map_count));
                page_ext = page_ext_next(page_ext);
        }
        /*
         * NOTE(review): page_ext has been advanced by the loop at this point;
         * presumably page_ext_put() only pairs with page_ext_get() and does not
         * depend on the exact pointer value - confirm against mm/page_ext.c.
         */
        page_ext_put(page_ext);
}

三、Page Table Check 机制的配置：

要使用Page Table Check机制，需要在编译时使能CONFIG_PAGE_TABLE_CHECK=y，并且需要在内核的cmdline中增加“page_table_check=on”，或者在Kconfig中使能CONFIG_PAGE_TABLE_CHECK_ENFORCED=y。
比如，我们的配置如下：

CONFIG_PAGE_TABLE_CHECK=y
CONFIG_PAGE_TABLE_CHECK_ENFORCED=y

四、测试方式：

前面提到，引入Page Table Check的起因是异常调用put_page导致的内存损坏。那么我们人为构造一个多次调用put_page的测试程序，来看看Page Table Check是如何生效的。

测试程序的代码如下：

void test_page_table_check(void)
{
        unsigned long addr;
        struct task_struct *task =  find_task_by_vpid(1);
        struct vm_area_struct *vma;
        struct mm_struct *mm =  task->mm;

        struct page *page;

        addr = mm->mmap_base - PAGE_SIZE;
        mmap_read_lock(mm);
        vma = find_vma(mm, addr);
        page = follow_page(vma, addr, FOLL_GET);
        put_page(page);
        mmap_read_unlock(mm);

        put_page(page);
}

测试程序首先获取init进程中通过mmap分配的一页内存（page），然后故意在最后多调用了一次put_page，使该page的引用计数降为0，从而被释放。而在释放内存时，__page_table_check_zero检查到anon_map_count不为0，于是触发了panic。

具体的调用过程可以参考下面的堆栈：

[  132.032451][2:3990:sh] ------------[ cut here ]------------
[  132.032453][2:3990:sh] kernel BUG at mm/page_table_check.c:143!
[  132.032458][2:3990:sh] Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP
[  132.032822][2:3990:sh] CPU: 2 PID: 3990 Comm: sh Tainted: G S      W  OE      6.1.25-android14-11-maybe-dirty-qki-consolidate #1
[  132.032829][2:3990:sh] pstate: 82400005 (Nzcv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--)
[  132.032833][2:3990:sh] pc : __page_table_check_zero+0xcc/0xdc
[  132.032843][2:3990:sh] lr : __page_table_check_zero+0x30/0xdc
[  132.032846][2:3990:sh] sp : ffffffc03094bb10
[  132.032848][2:3990:sh] x29: ffffffc03094bb10 x28: ffffff88c4a00000 x27: 0000000000000000
[  132.032854][2:3990:sh] x26: ffffffe31e256000 x25: ffffffe31e256000 x24: 0000000000000001
[  132.032858][2:3990:sh] x23: 0000000000000000 x22: ffffffe31d36b523 x21: ffffffe31d3cac1c
[  132.032862][2:3990:sh] x20: ffffff8023a81760 x19: fffffffe01baba40 x18: ffffffe31e18b240
[  132.032866][2:3990:sh] x17: 00000000ad6b63b6 x16: 00000000ad6b63b6 x15: ffffffe31c2ad328
[  132.032870][2:3990:sh] x14: ffffffe31b7466fc x13: ffffffc030948000 x12: ffffffc03094c000
[  132.032874][2:3990:sh] x11: 0000000000000060 x10: ffffffe31e178720 x9 : 0000000000000001
[  132.032878][2:3990:sh] x8 : ffffff8023a817b8 x7 : ffffffe31c18cda4 x6 : ffffffe31c1e5ee8
[  132.032882][2:3990:sh] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000002
[  132.032886][2:3990:sh] x2 : 0000000000000000 x1 : ffffffe31d3bf383 x0 : ffffff8023a81760
[  132.032890][2:3990:sh] Call trace:
[  132.032892][2:3990:sh]  __page_table_check_zero+0xcc/0xdc
[  132.032896][2:3990:sh]  free_unref_page_prepare+0x36c/0x42c
[  132.032903][2:3990:sh]  free_unref_page+0x58/0x268
[  132.032907][2:3990:sh]  __folio_put+0x54/0x80
[  132.032917][2:3990:sh]  test_page_table_check+0x114/0x1f8 [mz_stability_test]
[  132.032930][2:3990:sh]  proc_generate_oops_write+0x960/0xa18 [mz_stability_test]
[  132.032939][2:3990:sh]  proc_reg_write+0xfc/0x170
[  132.032949][2:3990:sh]  vfs_write+0x110/0x2d0
[  132.032956][2:3990:sh]  ksys_write+0x80/0xf0
[  132.032960][2:3990:sh]  __arm64_sys_write+0x24/0x34
[  132.032965][2:3990:sh]  invoke_syscall+0x60/0x124
[  132.032975][2:3990:sh]  el0_svc_common+0xcc/0x118
[  132.032980][2:3990:sh]  do_el0_svc+0x34/0xb8
[  132.032984][2:3990:sh]  el0_svc+0x30/0xb0
[  132.032992][2:3990:sh]  el0t_64_sync_handler+0x68/0xb4
[  132.032996][2:3990:sh]  el0t_64_sync+0x1a0/0x1a4

五、小结

Page Table Check在页表操作时增加校验，用来检查是否存在非法共享等由软件缺陷引入的问题，从而尽早发现问题，防止某些内存损坏。无论在生产环境还是研发阶段，它对定位硬件和工艺原因导致的随机内存跳变问题也会有所帮助。

六、参考

https://lwn.net/Articles/876264/
https://lore.kernel.org/all/xr9335nxwc5y.fsf@gthelen2.svl.corp.google.com/

内核内存稳定性新特性：Page Table Check 机制解读

一、为何引入Page Table Check机制：

二、Page Table Check 机制的实现：

三、Page Table Check 机制的配置：

四、测试方式：

五、小结

六、参考

FEATURED TAGS

FRIENDS

一、 为何引入Page Table Check机制：

二、Page Table Check 机制的实现：

三、Page Table Check 机制的配置：

四、测试方式：

五、小结

六、参考

FEATURED TAGS

FRIENDS

一、为何引入Page Table Check机制：