eBPF Talk: introduce bpf_timer

bpf_timer 是 eBPF 里基于 hrtimer 实现的定时器。

hrtimers - subsystem for high-resolution kernel timers
LWN bpf: Introduce BPF timers.
bpf: Introduce bpf timers. since 5.15 kernel

bpf_timer 最初的需求是在 perf events bpf prog 中做定期采样；后来在 XDP 中可以用来做垃圾回收和健康检测。

`bpf_timer` demo

在 eBPF 代码里使用 bpf_timer 时，主要有两个步骤：

定义一个值包含 struct bpf_timer 的 bpf map。
在 bpf prog 里调用 bpf_timer_init()、bpf_timer_set_callback()、bpf_timer_start()。

示例代码如下：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58



/* Value type of the tcp_timers map: wraps the struct bpf_timer that the
 * bpf_timer_*() helpers below operate on.
 */
struct tcp_timer {
    struct bpf_timer timer;
};

/* Hash map holding at most 10 entries. Keys are struct sk_key (defined
 * outside this excerpt); each value is a struct tcp_timer, i.e. one
 * bpf_timer per key.
 */
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 10);
    __type(key, struct sk_key);
    __type(value, struct tcp_timer);
} tcp_timers SEC(".maps");

/* Timer callback registered through bpf_timer_set_callback() in
 * handle_new_connection(). It only logs the saddr/daddr stored in the key
 * and returns 0; the map and timer arguments are unused here.
 */
static int
timer_cb(struct bpf_map *map, struct sk_key *key, struct tcp_timer *timer)
{
    bpf_printk("timer_cb, new connection 0x%x -> 0x%x\n", key->saddr, key->daddr);
    return 0;
}

/* Logs the connection and arms a one-shot bpf_timer for it.
 *
 * This is an abridged excerpt: the "// ..." markers stand for code that is
 * not shown (building `key` from `sk`, and presumably the NULL / return
 * value checks -- confirm against the full source).
 */
static __noinline void
handle_new_connection(void *ctx, struct sock *sk)
{
    // ...

    bpf_printk("handle_new_connection, new connection 0x%x -> 0x%x\n", key.saddr, key.daddr);

    // Fetch the map entry for this key, inserting a zeroed one if it is absent.
    struct tcp_timer init_timer = {};
    struct tcp_timer *timer = bpf_map_lookup_or_try_init(&tcp_timers, &key, &init_timer);
    // ...

    // Step 1: bind the timer to its owning map and choose the clock.
    int ret;
    ret = bpf_timer_init(&timer->timer, &tcp_timers, CLOCK_BOOTTIME);
    //...

    // Step 2: register the function to run when the timer fires.
    ret = bpf_timer_set_callback(&timer->timer, timer_cb);
    // ...

    // Step 3: arm the timer; per the helper documentation the second argument
    // is the expiry in nanoseconds and the third is flags.
    ret = bpf_timer_start(&timer->timer, 100, 0);
    // ...
}

/* fentry program attached to tcp_connect(): hands the socket to
 * handle_new_connection() and always returns 0.
 */
SEC("fentry/tcp_connect")
int BPF_PROG(tcp_connect, struct sock *sk)
{
    handle_new_connection(ctx, sk);

    return 0;
}

/* fexit program attached to inet_csk_complete_hashdance(). By the fexit
 * convention the trailing `ret` argument is the traced function's return
 * value; the connection is handled only when it is non-NULL.
 */
SEC("fexit/inet_csk_complete_hashdance")
int BPF_PROG(inet_csk_complete_hashdance, struct sock *sk, struct sock *child,
    struct request_sock *req, bool own_req, struct sock *ret)
{
    if (ret)
        handle_new_connection(ctx, ret);

    return 0;
}

完整代码请看：GitHub - bpf-timer。

例子的运行效果如下：

1
2
3
4
5


# cat /sys/kernel/debug/tracing/trace_pipe
          <idle>-0       [001] ..s31  6447.898964: bpf_trace_printk: handle_new_connection, new connection 0x8a01a8c0 -> 0xc01a8c0
     ksoftirqd/1-23      [001] ..s1.  6447.899336: bpf_trace_printk: timer_cb, new connection 0x8a01a8c0 -> 0xc01a8c0
           <...>-7423    [000] ...11  6452.363356: bpf_trace_printk: handle_new_connection, new connection 0xf02000a -> 0x64aae940
            curl-7423    [000] ..s2.  6452.363578: bpf_trace_printk: timer_cb, new connection 0xf02000a -> 0x64aae940

`bpf_timer` 限制

不过，bpf_timer 有不少限制；甚至，上面的 demo 改成 kprobe 就不能运行了。

首先，bpf_timer 只能在如下 bpf map 里使用：

BPF_MAP_TYPE_HASH
BPF_MAP_TYPE_LRU_HASH
BPF_MAP_TYPE_ARRAY
map in map

其次，bpf_timer 不支持 kprobe、tracepoint、perf_event、raw_tracepoint 这几类 tracing bpf prog（上面 demo 使用的 fentry/fexit 不在此列，所以可以运行）：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


// ${KERNEL}/kernel/bpf/verifier.c

/* Returns true for the five program types listed below and false for every
 * other type. Note that BPF_PROG_TYPE_TRACING (fentry/fexit) is not in the
 * list.
 */
static bool is_tracing_prog_type(enum bpf_prog_type type)
{
    switch (type) {
    case BPF_PROG_TYPE_KPROBE:
    case BPF_PROG_TYPE_TRACEPOINT:
    case BPF_PROG_TYPE_PERF_EVENT:
    case BPF_PROG_TYPE_RAW_TRACEPOINT:
    case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
        return true;
    default:
        return false;
    }
}

/* Abridged excerpt showing only the bpf_timer check: when the map's record
 * contains a BPF_TIMER field and the program type is one that
 * is_tracing_prog_type() accepts, the verifier logs a message and returns
 * -EINVAL. `prog_type` is set in the elided code.
 */
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
                    struct bpf_map *map,
                    struct bpf_prog *prog)

{
    // ...
    if (btf_record_has_field(map->record, BPF_TIMER)) {
        if (is_tracing_prog_type(prog_type)) {
            verbose(env, "tracing progs cannot use bpf_timer yet\n");
            return -EINVAL;
        }
    }
    // ...
}

接着，bpf_timer 不能和 struct bpf_spin_lock 一起使用，因为 bpf_timer 在内核运行的时候的真实面貌如下：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11


// ${KERNEL}/kernel/bpf/helpers.c

/* the actual struct hidden inside uapi struct bpf_timer */
struct bpf_timer_kern {
    /* struct bpf_hrtimer is defined outside this excerpt */
    struct bpf_hrtimer *timer;
    /* bpf_spin_lock is used here instead of spinlock_t to make
     * sure that it always fits into space reserved by struct bpf_timer
     * regardless of LOCKDEP and spinlock debug flags.
     */
    struct bpf_spin_lock lock;
} __attribute__((aligned(8)));

最后，bpf_timer_init() 只支持 CLOCK_MONOTONIC、CLOCK_REALTIME、CLOCK_BOOTTIME 三种 clockid。如果对时间精度不敏感，任选一个即可。

额外情况，在 NMI context 里，bpf_timer helpers 都会返回 -EOPNOTSUPP。

小结

bpf_timer 是 eBPF 里基于 hrtimer 实现的定时器。

在使用 bpf_timer 时，需要注意以上限制。

文章目录

bpf_timer demo

bpf_timer 限制

小结

`bpf_timer` demo

`bpf_timer` 限制