在 eBPF Talk: bpf2bpf 特性简介 中已介绍了 bpf2bpf 特性,同时有 demo 介绍该怎么使用该特性。
在该特性神秘面纱的背后,到底是怎样的呢?让我娓娓道来。
编译阶段
不懂编译器 clang 中 bpf 那部分源代码,就先略过编译过程,直接看看编译结果吧。
以 bpf2bpf example 为例,使用 llvm-objdump -S tcpconn_bpfel.o
查看汇编代码,只能看到如下结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
$ llvm-objdump -S tcpconn_bpfel.o
tcpconn_bpfel.o: file format ELF64-BPF
Disassembly of section .text:
0000000000000000 handle_new_connection:
llvm-objdump: warning: 'tcpconn_bpfel.o': failed to parse debug information for tcpconn_bpfel.o
xx: ...
49: 95 00 00 00 00 00 00 00 exit
Disassembly of section kprobe/tcp_connect:
0000000000000000 k_tcp_connect:
0: 79 12 70 00 00 00 00 00 r2 = *(u64 *)(r1 + 112)
1: 85 10 00 00 ff ff ff ff call -1 // pseudo call
2: b7 00 00 00 00 00 00 00 r0 = 0
3: 95 00 00 00 00 00 00 00 exit
Disassembly of section kprobe/inet_csk_complete_hashdance:
0000000000000000 k_icsk_complete_hashdance:
0: 79 12 68 00 00 00 00 00 r2 = *(u64 *)(r1 + 104)
1: 85 10 00 00 ff ff ff ff call -1 // pseudo call
2: b7 00 00 00 00 00 00 00 r0 = 0
3: 95 00 00 00 00 00 00 00 exit
|
呃,尴尬了。因为使用了 eBPF CO-RE,导致 llvm-objdump -S
看不了 C 源代码了。那就换一种姿势来查看 eBPF 汇编吧。
(关于 eBPF 汇编的一个有趣对比:eBPF Talk: 此汇编非彼汇编。)
使用如下 Go 代码来查看采用了 CO-RE 编译的 eBPF 汇编:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
package main
import (
"fmt"
"log"
"os"
"github.com/cilium/ebpf"
)
// must unwraps a (value, error) pair: it returns x when err is nil and
// otherwise logs err via log.Fatal, which terminates the process.
func must[T any](x T, err error) T {
	if err == nil {
		return x
	}
	log.Fatal(err)
	return x // not reached: log.Fatal exits; needed to satisfy the compiler
}
// main expects exactly one command-line argument, the path to an eBPF ELF
// object. It loads the collection spec from that file and prints the
// instructions of every program found in it.
func main() {
	args := os.Args
	if len(args) != 2 {
		log.Fatal("miss elf file")
	}

	f := must(os.Open(args[1]))
	defer f.Close()

	collSpec := must(ebpf.LoadCollectionSpecFromReader(f))
	// Programs is ranged by key; iteration order is whatever the map yields.
	for name := range collSpec.Programs {
		fmt.Printf("%v\n", collSpec.Programs[name].Instructions)
	}
}
|
结果如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
k_icsk_complete_hashdance:
; sk = (typeof(sk))PT_REGS_PARM2(ctx);
0: LdXMemDW dst: r2 src: r1 off: 104 imm: 0
; handle_new_connection(ctx, sk);
1: Call -1 <handle_new_connection>
; return 0;
2: MovImm dst: r0 imm: 0
3: Exit
handle_new_connection:
; handle_new_connection(void *ctx, struct sock *sk)
4: MovReg dst: r8 src: r2
5: MovReg dst: r6 src: r1
6: MovImm dst: r1 imm: 0
...
k_tcp_connect:
; sk = (typeof(sk))PT_REGS_PARM1(ctx);
0: LdXMemDW dst: r2 src: r1 off: 112 imm: 0
; handle_new_connection(ctx, sk);
1: Call -1 <handle_new_connection>
; return 0;
2: MovImm dst: r0 imm: 0
3: Exit
handle_new_connection:
; handle_new_connection(void *ctx, struct sock *sk)
4: MovReg dst: r8 src: r2
5: MovReg dst: r6 src: r1
6: MovImm dst: r1 imm: 0
...
|
由此可知,在 eBPF ELF 文件中,call
指令跟目标函数之间有关联。
到底是怎么关联在一起的呢?在加载阶段能看出一二。
加载阶段
在加载阶段,对于 bpf2bpf 特性,需要做如下处理:
- 解析 ELF 文件中的 eBPF 汇编
- 找到
call
指令跟目标函数之间的关系
- 将多个 eBPF 汇编片段合并成一个
- 修正
call
指令
cilium/ebpf
中简略的函数调用路径如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
LoadCollectionSpecFromReader() // 第 1 步
|-->loadRelocations()
|-->loadSectionRelocations()
|-->loadProgramSections()
|-->loadFunctions()
| |-->func (ec *elfCode) relocateInstruction(ins *asm.Instruction, rel elf.Symbol) error {
| case programSection:
| case elf.STT_SECTION: // 第 2 步
|
| // The function we want to call is in the indicated section,
| // at the offset encoded in the instruction itself. Reverse
| // the calculation to find the real function we're looking for.
| // A value of -1 references the first instruction in the section.
| offset := int64(int32(ins.Constant)+1) * asm.InstructionSize
| sym, ok := target.symbols[uint64(offset)]
| if !ok {
| return fmt.Errorf("call: no symbol at offset %d", offset)
| }
|
| name = sym.Name
| ins.Constant = -1
|
| *ins = ins.WithReference(name)
| }
|-->flattenPrograms()
|-->flattenInstructions() // 第 3 步
LoadAndAssign()
|-->getValue()
|-->loadProgram()
|-->newProgramWithOptions()
|-->insns.Marshal(buf, internal.NativeEndian)
|-->insns.encodeFunctionReferences() { // 第 4 步
// Find all instructions tagged as references to other symbols.
// Depending on the instruction type, populate their constant or offset
// fields to point to the symbol they refer to within the insn stream.
iter = insns.Iterate()
for iter.Next() {
offset := iter.Offset
ins := iter.Ins
switch {
case ins.IsFunctionReference() && ins.Constant == -1:
symOffset, ok := symbolOffsets[ins.Reference()]
ins.Constant = int64(symOffset - offset - 1)
}
|-->fd, err := sys.ProgLoad(attr) // 将 eBPF prog load 到内核,此时内核使用校验器进行校验
|-->fd, err := BPF(BPF_PROG_LOAD, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
|-->r1, _, errNo := unix.Syscall(unix.SYS_BPF, uintptr(cmd), uintptr(attr), size)
|
校验阶段
就不讲解 eBPF prog load 过程了,直接看下校验器是怎么处理 bpf2bpf 特性的吧。
- JIT 将 eBPF 汇编翻译成机器码
- 重新计算指令的
imm
属性
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
|-->bpf_prog_load()
|-->bpf_check() // run eBPF verifier
|-->fixup_call_args()
|-->jit_subprogs() {
func[i] = bpf_int_jit_compile(func[i]); // 每个平台各自实现 bpf_int_jit_compile 函数
// ...
/* at this point all bpf functions were successfully JITed
* now populate all bpf_calls with correct addresses and
* run last pass of JIT
*/
for (i = 0; i < env->subprog_cnt; i++) {
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
continue;
subprog = insn->off;
insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
__bpf_call_base; // 重新计算 imm,__bpf_call_base + imm 即是目标函数调用的起始地址
}
/* we use the aux data to keep a list of the start addresses
* of the JITed images for each function in the program
*
* for some architectures, such as powerpc64, the imm field
* might not be large enough to hold the offset of the start
* address of the callee's JITed image from __bpf_call_base
*
* in such cases, we can lookup the start address of a callee
* by using its subprog id, available from the off field of
* the call instruction, as an index for this list
*/
func[i]->aux->func = func;
func[i]->aux->func_cnt = env->subprog_cnt;
}
for (i = 0; i < env->subprog_cnt; i++) {
old_bpf_func = func[i]->bpf_func;
tmp = bpf_int_jit_compile(func[i]);
if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
err = -ENOTSUPP;
goto out_free;
}
cond_resched();
}
//...
}
|
运行阶段
一切皆已准备好,就看看运行阶段是怎么跑起来的。
- 执行 JIT’ed 机器码
- 遇到
call
指令,进行一次函数调用
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
// include/linux/filter.h
#define BPF_PROG_RUN(prog, ctx) ({ \
u32 ret; \
cant_sleep(); \
if (static_branch_unlikely(&bpf_stats_enabled_key)) { \
struct bpf_prog_stats *stats; \
u64 start = sched_clock(); \
ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \
stats = this_cpu_ptr(prog->aux->stats); \
u64_stats_update_begin(&stats->syncp); \
stats->cnt++; \
stats->nsecs += sched_clock() - start; \
u64_stats_update_end(&stats->syncp); \
} else { \
ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \
} \
ret; })
|
此时通过 bpf_func 执行 eBPF 程序:开启 JIT 时执行的是 JIT 后的机器码,未开启 JIT 时则由解释器直接解析执行 bpf 汇编指令。而 bpf_func
属性是在 bpf prog load 的时候就准备好了。
暂且跳过 bpf prog load 过程,直接看看 bpf_func
执行中对 call
指令的处理吧。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
// kernel/bpf/core.c
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
// ...
/* CALL */
JMP_CALL:
/* Function call scratches BPF_R1-BPF_R5 registers,
* preserves BPF_R6-BPF_R9, and stores return value
* into BPF_R0.
*/
BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
BPF_R4, BPF_R5);
CONT;
// ...
}
|
比较简单,直接调用 __bpf_call_base + imm
得到的函数。
小结
有了此次探究 bpf2bpf 特性的经验,后面探究其他特性就方便多了。期待后面更多的特性探索吧。