eBPF Talk: 动态或静态 tailcall

动态 tailcall？静态 tailcall？为什么 tailcall 会有动静之分呢？

其实，就是看在使用 bpf_tail_call() 时，传入的 index 参数是常量还是变量。

动态 `tailcall`

在使用 bpf_tail_call() 时，传入的 index 参数是变量；而且，该变量的值是在运行时才能确定的。如果该值在编译时就能推算出来，就会变成静态 tailcall。

比如，该 index 是从某个 bpf map 里取出来的、或者从 ctx 里取出来的。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


/*
 * Dynamic tail call demo, attached as a kprobe on
 * inet_csk_complete_hashdance().
 *
 * Saves the socket pointer taken from the probe's second argument into the
 * `socks` map under key 0, then tail-calls into `progs` with an index that is
 * read from the socket at run time. Returns 0 when the tail call is not taken.
 */
SEC("kprobe/inet_csk_complete_hashdance")
int k_icsk_complete_hashdance(struct pt_regs *ctx)
{
    struct sock *probed_sk = (struct sock *)PT_REGS_PARM2(ctx);
    __u32 zero_key = 0;
    u32 prog_idx;

    /* The map value is the pointer itself, hence the address of the local. */
    bpf_map_update_elem(&socks, &zero_key, &probed_sk, BPF_ANY);

    /* The index depends on socket state, so it is unknown at build time. */
    prog_idx = BPF_CORE_READ(probed_sk, __sk_common.skc_daddr);
    bpf_tail_call(ctx, &progs, prog_idx);   /* dynamic tailcall */

    return 0;
}

静态 `tailcall`

在使用 bpf_tail_call() 时，传入的 index 参数是常量；或者，index 的值在编译时就能推算出来。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13


/*
 * Static tail call demo, attached as a kprobe on tcp_connect().
 *
 * Saves the socket pointer taken from the probe's first argument into the
 * `socks` map under key 0, then tail-calls into slot 0 of `progs`. The slot is
 * a literal constant, so it is known at build time. Returns 0 when the tail
 * call is not taken.
 */
SEC("kprobe/tcp_connect")
int k_tcp_connect(struct pt_regs *ctx)
{
    struct sock *probed_sk = (struct sock *)PT_REGS_PARM1(ctx);
    __u32 zero_key = 0;

    /* The map value is the pointer itself, hence the address of the local. */
    bpf_map_update_elem(&socks, &zero_key, &probed_sk, BPF_ANY);

    bpf_tail_call_static(ctx, &progs, 0);   /* static tailcall */

    return 0;
}

P.S. demo 源代码：GitHub - Asphaltt/learn-by-example。

其中 bpf_tail_call_static() 的定义如下：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28


#if __clang_major__ >= 8 && defined(__bpf__)
static __always_inline void
bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
{
    if (!__builtin_constant_p(slot))
        __bpf_unreachable();

    /*
     * Provide a hard guarantee that LLVM won't optimize setting r2 (map
     * pointer) and r3 (constant map index) from _different paths_ ending
     * up at the _same_ call insn as otherwise we won't be able to use the
     * jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel
     * given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key
     * tracking for prog array pokes") for details on verifier tracking.
     *
     * Note on clobber list: we need to stay in-line with BPF calling
     * convention, so even if we don't end up using r0, r4, r5, we need
     * to mark them as clobber so that LLVM doesn't end up using them
     * before / after the call.
     */
    asm volatile("r1 = %[ctx]\n\t"
             "r2 = %[map]\n\t"
             "r3 = %[slot]\n\t"
             "call 12"
             :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot)
             : "r0", "r1", "r2", "r3", "r4", "r5");
}
#endif

所以，在使用静态 tailcall 时，最好直接使用 bpf_tail_call_static() 这个封装函数：它会在编译期检查 index 是否为常量，并保证 index 以立即数的形式传给 tailcall。

动静之分

为什么 tailcall 会有动静之分呢？

从运行时出发，静态 tailcall 性能更好、更安全。而动态 tailcall 的性能稍差，因为要查询一次数组。

直接看 x86 JIT 对 tailcall 的处理：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53


// ${KERNEL}/arch/x86/net/bpf_jit_comp.c

/*
 * Excerpt of the x86 JIT main loop, reduced to the tail call case; everything
 * else is elided with "// ...".
 */
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
          int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
    // ...

            /*
             * Tail call instruction: the immediate selects the code path.
             * A non-zero imm32 is used as a 1-based index into poke_tab and
             * takes the direct path; imm32 == 0 takes the indirect path.
             */
            case BPF_JMP | BPF_TAIL_CALL:
            if (imm32)
                emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
                              &prog, image + addrs[i - 1],
                              callee_regs_used,
                              bpf_prog->aux->stack_depth,
                              ctx);
            else
                emit_bpf_tail_call_indirect(&prog,
                                callee_regs_used,
                                bpf_prog->aux->stack_depth,
                                image + addrs[i - 1],
                                ctx);
            break;

        // ...
}

/*
 * Emit the indirect tail call sequence. do_jit() calls this for a tail call
 * instruction whose imm32 is zero.
 *
 * Generate the following code:
 *
 * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
 *   if (index >= array->map.max_entries)
 *     goto out;
 *   if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
 *     goto out;
 *   prog = array->ptrs[index];
 *   if (prog == NULL)
 *     goto out;
 *   goto *(prog->bpf_func + prologue_size);
 * out:
 */
static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
                                        u32 stack_depth, u8 *ip,
                                        struct jit_context *ctx)
{
    /* Body elided in this excerpt. */
    // ...
}

/*
 * Emit the direct tail call sequence for one poke descriptor. do_jit() calls
 * this for a tail call instruction whose imm32 is non-zero, passing
 * &poke_tab[imm32 - 1] as @poke.
 *
 * Judging by the sample disassembly for this path, the emitted code only
 * checks the tail call counter and then relies on patchable nop/jmp slots;
 * it has no bounds check against max_entries and no NULL check on the target
 * program, unlike the indirect variant.
 */
static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
                                      u8 **pprog, u8 *ip,
                                      bool *callee_regs_used, u32 stack_depth,
                                      struct jit_context *ctx)
{
    /* Body elided in this excerpt. */
    // ...
}

再来对比一下对应的 x86 汇编：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44


# static tailcall, aka. emit_bpf_tail_call_direct()

   0xffffffffc004effa:  mov    -0x14(%rbp),%eax     # %eax = tailcall count
   0xffffffffc004f000:  cmp    $0x21,%eax           # %eax >= MAX_TAIL_CALL_CNT
   0xffffffffc004f003:  jae    0xffffffffc004f023   # goto out
   0xffffffffc004f005:  add    $0x1,%eax            # %eax++ == tailcall count++
   0xffffffffc004f008:  mov    %eax,-0x14(%rbp)     # save tailcall count to stack
   0xffffffffc004f00e:  nopl   0x0(%rax,%rax,1)     # do nothing at stub
   0xffffffffc004f013:  pop    %r13                 # restore callee saved registers
   0xffffffffc004f015:  pop    %rbx                 # restore callee saved registers
   0xffffffffc004f016:  pop    %rax                 # restore callee saved registers
   0xffffffffc004f017:  add    $0x10,%rsp           # adjust stack depth
   0xffffffffc004f01e:  jmp    0xffffffffc004fae3   # long jmp to target tailcall prog's after-prologue part
   0xffffffffc004f023:  xor    %eax,%eax

# dynamic tailcall, aka. emit_bpf_tail_call_indirect()

    /*
     * rdi - pointer to ctx
     * rsi - pointer to bpf_array
     * rdx - index in bpf_array
     */

   0xffffffffc004fddb:  mov    -0x10(%rbp),%edx         # %edx = index
   0xffffffffc004fdde:  mov    %rbx,%rdi                # %rdi = ctx
   0xffffffffc004fde1:  movabs $0xffff9e67028fee00,%rsi # %rsi = PROG_ARRAY bpf map
   0xffffffffc004fdeb:  mov    %edx,%edx                # %edx = index
   0xffffffffc004fded:  cmp    %edx,0x24(%rsi)          # %edx >= array->map.max_entries
   0xffffffffc004fdf0:  jbe    0xffffffffc004fe29       # goto out
   0xffffffffc004fdf2:  mov    -0x14(%rbp),%eax         # %eax = tailcall count
   0xffffffffc004fdf8:  cmp    $0x21,%eax               # %eax >= MAX_TAIL_CALL_CNT
   0xffffffffc004fdfb:  jae    0xffffffffc004fe29       # goto out
   0xffffffffc004fdfd:  add    $0x1,%eax                # %eax++ == tailcall count++
   0xffffffffc004fe00:  mov    %eax,-0x14(%rbp)         # save tailcall count to stack
   0xffffffffc004fe06:  mov    0x110(%rsi,%rdx,8),%rcx  # %rcx = array->ptrs[index]
   0xffffffffc004fe0e:  test   %rcx,%rcx                # %rcx == NULL
   0xffffffffc004fe11:  je     0xffffffffc004fe29       # goto out
   0xffffffffc004fe13:  pop    %rbx                     # restore callee saved registers
   0xffffffffc004fe14:  pop    %rax                     # restore callee saved registers
   0xffffffffc004fe15:  add    $0x10,%rsp               # adjust stack depth
   0xffffffffc004fe1c:  mov    0x30(%rcx),%rcx          # %rcx = prog->bpf_func
   0xffffffffc004fe20:  add    $0xb,%rcx                # %rcx += X86_TAIL_CALL_OFFSET
   0xffffffffc004fe24:  jmp    0xffffffffb58d8fc0       # long jmp to target tailcall prog's after-prologue part
   0xffffffffc004fe29:  xor    %eax,%eax

静态 tailcall 使用了 11 条指令，而动态 tailcall 使用了 20 条指令。

这是因为静态 tailcall 不需要查询数组，而动态 tailcall 需要查询数组。

而且，在 PROG_ARRAY 对应的 slot 还没有被更新的情况下，静态 tailcall 不会生成跳转到目标程序的 long jmp 指令，而是用一条 jmp 指令直接跳到 out，跳过后面恢复寄存器、调整栈的几条指令。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


# static tailcall, aka. emit_bpf_tail_call_direct(), without update

   0xffffffffc004effe:  mov    -0x14(%rbp),%eax
   0xffffffffc004f004:  cmp    $0x21,%eax
   0xffffffffc004f007:  jae    0xffffffffc004f027
   0xffffffffc004f009:  add    $0x1,%eax
   0xffffffffc004f00c:  mov    %eax,-0x14(%rbp)
   0xffffffffc004f012:  jmp    0xffffffffc004f027   # short jmp == goto out
   0xffffffffc004f017:  pop    %r13
   0xffffffffc004f019:  pop    %rbx
   0xffffffffc004f01a:  pop    %rax
   0xffffffffc004f01b:  add    $0x10,%rsp
   0xffffffffc004f022:  nopl   0x0(%rax,%rax,1)
   0xffffffffc004f027:  xor    %eax,%eax

小结

tailcall 有动静之分；
静态 tailcall 性能更好、更安全；
动态 tailcall 性能稍差，但更灵活。

更多 tailcall 细节，请查看 eBPF Talk: 更新 tailcall PROG_ARRAY bpf map。

eBPF Talk: 动态或静态 tailcall

文章目录

动态 `tailcall`

静态 `tailcall`

动静之分

小结

知识星球

星球里的专栏：

《XDP 进阶手册》

文章目录

动态 tailcall

静态 tailcall

动静之分

小结

知识星球

星球里的专栏：

《XDP 进阶手册》

动态 `tailcall`

静态 `tailcall`