动态 tailcall?
静态 tailcall?
为什么 tailcall
会有动静之分呢?
其实,就是看在使用 bpf_tail_call()
时,传入的 index
参数是常量还是变量。
动态 tailcall
在使用 bpf_tail_call()
时,传入的 index
参数是变量;而且,该变量的值是在运行时才能确定的。如果该值在编译时就能推算出来,就会变成静态 tailcall
。
比如,该 index
是从某个 bpf map 里取出来的、或者从 ctx
里取出来的。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
SEC("kprobe/inet_csk_complete_hashdance")
int k_icsk_complete_hashdance(struct pt_regs *ctx)
{
	/* Second probe argument, reinterpreted as a socket pointer. */
	struct sock *sock = (struct sock *)PT_REGS_PARM2(ctx);
	__u32 zero_key = 0;
	__u32 target;

	/* Store the socket pointer in the `socks` map under key 0. */
	bpf_map_update_elem(&socks, &zero_key, &sock, BPF_ANY);

	/*
	 * The index is read from the socket at run time, so it cannot be
	 * folded into a constant: this is the dynamic flavour of tailcall.
	 */
	target = BPF_CORE_READ(sock, __sk_common.skc_daddr);
	bpf_tail_call(ctx, &progs, target); // dynamic tailcall

	return 0;
}
|
静态 tailcall
在使用 bpf_tail_call()
时,传入的 index
参数是常量;或者,index
的值在编译时就能推算出来。
1
2
3
4
5
6
7
8
9
10
11
12
13
|
SEC("kprobe/tcp_connect")
int k_tcp_connect(struct pt_regs *ctx)
{
	/* First probe argument, reinterpreted as a socket pointer. */
	struct sock *sock = (struct sock *)PT_REGS_PARM1(ctx);
	__u32 zero_key = 0;

	/* Store the socket pointer in the `socks` map under key 0. */
	bpf_map_update_elem(&socks, &zero_key, &sock, BPF_ANY);

	/* The index is the literal 0, so this is the static flavour of tailcall. */
	bpf_tail_call_static(ctx, &progs, 0); // static tailcall

	return 0;
}
|
P.S. demo 源代码:GitHub - Asphaltt/learn-by-example。
其中 bpf_tail_call_static()
的定义如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
#if __clang_major__ >= 8 && defined(__bpf__)
/*
 * bpf_tail_call_static() - issue a tail call whose slot is a compile-time
 * constant.
 * @ctx:  value loaded into r1 before the call.
 * @map:  value loaded into r2 before the call (the map pointer, per the
 *        comment in the body).
 * @slot: value encoded as an immediate and loaded into r3; must be a
 *        compile-time constant.
 *
 * Only available when building with clang >= 8 for a BPF target, as gated by
 * the surrounding #if. Returns nothing.
 *
 * NOTE(review): "call 12" presumably refers to helper id 12, i.e.
 * bpf_tail_call -- confirm against the helper enumeration in the kernel UAPI
 * header.
 */
static __always_inline void
bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
{
/*
 * Reject a slot that is not a compile-time constant: the "i" constraint in
 * the asm below needs an immediate operand.
 */
if (!__builtin_constant_p(slot))
__bpf_unreachable();
/*
* Provide a hard guarantee that LLVM won't optimize setting r2 (map
* pointer) and r3 (constant map index) from _different paths_ ending
* up at the _same_ call insn as otherwise we won't be able to use the
* jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel
* given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key
* tracking for prog array pokes") for details on verifier tracking.
*
* Note on clobber list: we need to stay in-line with BPF calling
* convention, so even if we don't end up using r0, r4, r5, we need
* to mark them as clobber so that LLVM doesn't end up using them
* before / after the call.
*/
asm volatile("r1 = %[ctx]\n\t"
"r2 = %[map]\n\t"
"r3 = %[slot]\n\t"
"call 12"
:: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot)
: "r0", "r1", "r2", "r3", "r4", "r5");
}
#endif
|
所以,在使用静态 tailcall
时,最好使用 bpf_tail_call_static()
这个封装:它要求 index 必须是编译期常量,并保证 index 以立即数的形式传给 tailcall。
动静之分
为什么 tailcall
会有动静之分呢?
从运行时出发,静态 tailcall
性能更好、更安全。而动态 tailcall
的性能稍差,因为要查询一次数组。
直接看 x86 JIT 对 tailcall
的处理:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
// ${KERNEL}/arch/x86/net/bpf_jit_comp.c
// Excerpt of the x86 JIT translation routine; only the tail-call case is
// shown, everything else is elided with "// ...".
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
// ...
// Tail call: a non-zero imm32 selects the direct variant and uses
// poke_tab[imm32 - 1] as its poke descriptor; imm32 == 0 selects the
// indirect variant. Both receive the address image + addrs[i - 1].
case BPF_JMP | BPF_TAIL_CALL:
if (imm32)
emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
&prog, image + addrs[i - 1],
callee_regs_used,
bpf_prog->aux->stack_depth,
ctx);
else
emit_bpf_tail_call_indirect(&prog,
callee_regs_used,
bpf_prog->aux->stack_depth,
image + addrs[i - 1],
ctx);
break;
// ...
}
/*
* Generate the following code:
*
* ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
* if (index >= array->map.max_entries)
* goto out;
* if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
* prog = array->ptrs[index];
* if (prog == NULL)
* goto out;
* goto *(prog->bpf_func + prologue_size);
* out:
*/
// Indirect variant: do_jit() calls this for a BPF_TAIL_CALL instruction whose
// imm32 is zero. The pseudo-code above describes what gets emitted: a bounds
// check, a tail-call-count check, a lookup in array->ptrs[] and a NULL check
// before the jump.
static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
u32 stack_depth, u8 *ip,
struct jit_context *ctx)
{
// ... (body elided in this excerpt)
}
// Direct variant: do_jit() calls this for a BPF_TAIL_CALL instruction whose
// imm32 is non-zero, passing the matching poke_tab entry as @poke.
static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
u8 **pprog, u8 *ip,
bool *callee_regs_used, u32 stack_depth,
struct jit_context *ctx)
{
// ... (body elided in this excerpt)
}
|
再来对比一下对应的 x86 汇编:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
|
# static tailcall, aka. emit_bpf_tail_call_direct()
0xffffffffc004effa: mov -0x14(%rbp),%eax # %eax = tailcall count
0xffffffffc004f000: cmp $0x21,%eax # %eax >= MAX_TAIL_CALL_CNT
0xffffffffc004f003: jae 0xffffffffc004f023 # goto out
0xffffffffc004f005: add $0x1,%eax # %eax++ == tailcall count++
0xffffffffc004f008: mov %eax,-0x14(%rbp) # save tailcall count to stack
0xffffffffc004f00e: nopl 0x0(%rax,%rax,1) # do nothing at stub
0xffffffffc004f013: pop %r13 # restore callee saved registers
0xffffffffc004f015: pop %rbx # restore callee saved registers
0xffffffffc004f016: pop %rax # restore callee saved registers
0xffffffffc004f017: add $0x10,%rsp # adjust stack depth
0xffffffffc004f01e: jmp 0xffffffffc004fae3 # long jmp to target tailcall prog's after-prologue part
0xffffffffc004f023: xor %eax,%eax
# dynamic tailcall, aka. emit_bpf_tail_call_indirect()
/*
* rdi - pointer to ctx
* rsi - pointer to bpf_array
* rdx - index in bpf_array
*/
0xffffffffc004fddb: mov -0x10(%rbp),%edx # %edx = index
0xffffffffc004fdde: mov %rbx,%rdi # %rdi = ctx
0xffffffffc004fde1: movabs $0xffff9e67028fee00,%rsi # %rsi = PROG_ARRAY bpf map
0xffffffffc004fdeb: mov %edx,%edx # %edx = index
0xffffffffc004fded: cmp %edx,0x24(%rsi) # %edx >= array->map.max_entries
0xffffffffc004fdf0: jbe 0xffffffffc004fe29 # goto out
0xffffffffc004fdf2: mov -0x14(%rbp),%eax # %eax = tailcall count
0xffffffffc004fdf8: cmp $0x21,%eax # %eax >= MAX_TAIL_CALL_CNT
0xffffffffc004fdfb: jae 0xffffffffc004fe29 # goto out
0xffffffffc004fdfd: add $0x1,%eax # %eax++ == tailcall count++
0xffffffffc004fe00: mov %eax,-0x14(%rbp) # save tailcall count to stack
0xffffffffc004fe06: mov 0x110(%rsi,%rdx,8),%rcx # %rcx = array->ptrs[index]
0xffffffffc004fe0e: test %rcx,%rcx # %rcx == NULL
0xffffffffc004fe11: je 0xffffffffc004fe29 # goto out
0xffffffffc004fe13: pop %rbx # restore callee saved registers
0xffffffffc004fe14: pop %rax # restore callee saved registers
0xffffffffc004fe15: add $0x10,%rsp # adjust stack depth
0xffffffffc004fe1c: mov 0x30(%rcx),%rcx # %rcx = prog->bpf_func
0xffffffffc004fe20: add $0xb,%rcx # %rcx += X86_TAIL_CALL_OFFSET
0xffffffffc004fe24: jmp 0xffffffffb58d8fc0 # jmp to a fixed kernel-text thunk (presumably __x86_indirect_thunk_rcx) that jumps to *%rcx, i.e. the target tailcall prog's after-prologue part
0xffffffffc004fe29: xor %eax,%eax
|
静态 tailcall
使用了 11 条指令,而动态 tailcall
使用了 20 条指令。
这是因为静态 tailcall
不需要查询数组,而动态 tailcall
需要查询数组。
而且,在 PROG_ARRAY 中对应的 slot 还没有被更新(填充)的情况下,静态 tailcall
的桩(stub)位置不会生成 long jmp
指令,而是一条跳到 out 的 short jmp
指令,直接跳过后面恢复寄存器的几条指令。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# static tailcall, aka. emit_bpf_tail_call_direct(), without update
0xffffffffc004effe: mov -0x14(%rbp),%eax
0xffffffffc004f004: cmp $0x21,%eax
0xffffffffc004f007: jae 0xffffffffc004f027
0xffffffffc004f009: add $0x1,%eax
0xffffffffc004f00c: mov %eax,-0x14(%rbp)
0xffffffffc004f012: jmp 0xffffffffc004f027 # short jmp == goto out
0xffffffffc004f017: pop %r13
0xffffffffc004f019: pop %rbx
0xffffffffc004f01a: pop %rax
0xffffffffc004f01b: add $0x10,%rsp
0xffffffffc004f022: nopl 0x0(%rax,%rax,1)
0xffffffffc004f027: xor %eax,%eax
|
小结
tailcall
有动静之分;
- 静态
tailcall
性能更好、更安全;
- 动态
tailcall
性能稍差,更灵活;
更多 tailcall
细节,请查看 eBPF Talk: 更新 tailcall PROG_ARRAY bpf map。