《eBPF Talk: bpf2bpf 特性简介》中已介绍了 bpf2bpf 特性,并附有 demo 演示该怎么使用该特性。

在该特性神秘面纱的背后,到底是怎样的呢?让我娓娓道来。

编译阶段

不懂编译器 clang 中 bpf 那部分源代码,就先略过编译过程,直接看看编译结果吧。

以 bpf2bpf example 为例,使用 llvm-objdump -S tcpconn_bpfel.o 查看汇编代码,只能看到如下结果:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
$ llvm-objdump -S tcpconn_bpfel.o

tcpconn_bpfel.o:    file format ELF64-BPF


Disassembly of section .text:

0000000000000000 handle_new_connection:
llvm-objdump: warning: 'tcpconn_bpfel.o': failed to parse debug information for tcpconn_bpfel.o
      xx:   ...
      49:    95 00 00 00 00 00 00 00    exit

Disassembly of section kprobe/tcp_connect:

0000000000000000 k_tcp_connect:
       0:    79 12 70 00 00 00 00 00    r2 = *(u64 *)(r1 + 112)
       1:    85 10 00 00 ff ff ff ff    call -1 // pseudo call
       2:    b7 00 00 00 00 00 00 00    r0 = 0
       3:    95 00 00 00 00 00 00 00    exit

Disassembly of section kprobe/inet_csk_complete_hashdance:

0000000000000000 k_icsk_complete_hashdance:
       0:    79 12 68 00 00 00 00 00    r2 = *(u64 *)(r1 + 104)
       1:    85 10 00 00 ff ff ff ff    call -1 // pseudo call
       2:    b7 00 00 00 00 00 00 00    r0 = 0
       3:    95 00 00 00 00 00 00 00    exit

呃,尴尬了。因为使用了 eBPF CO-RE,导致 llvm-objdump -S 看不了 C 源代码了。那就换一种姿势来查看 eBPF 汇编吧。 (关于 eBPF 汇编的一个有趣对比:eBPF Talk: 此汇编非彼汇编。)

使用如下 Go 代码来查看采用了 CO-RE 编译的 eBPF 汇编:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
package main

import (
    "fmt"
    "log"
    "os"
    "sort"

    "github.com/cilium/ebpf"
)

// must unwraps a (value, error) pair. It returns x when err is nil;
// otherwise it reports err through log.Fatal, which ends the process.
func must[T any](x T, err error) T {
    if err == nil {
        return x
    }

    log.Fatal(err)
    return x // only here to satisfy the compiler; log.Fatal does not return
}

// main loads the eBPF ELF object named by the single command-line argument
// and prints the instructions of every program found in it.
//
// Programs are printed in ascending name order. spec.Programs is a map and
// Go randomizes map iteration order, so ranging over it directly would make
// the order of the dump change from run to run.
func main() {
    if len(os.Args) != 2 {
        log.Fatal("miss elf file")
    }

    elfFile := os.Args[1]
    fd := must(os.Open(elfFile))
    defer fd.Close()

    spec := must(ebpf.LoadCollectionSpecFromReader(fd))

    // Collect and sort the program names to get a stable output order.
    names := make([]string, 0, len(spec.Programs))
    for name := range spec.Programs {
        names = append(names, name)
    }
    sort.Strings(names)

    for _, name := range names {
        fmt.Printf("%v\n", spec.Programs[name].Instructions)
    }
}

结果如下:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
k_icsk_complete_hashdance:
       ; sk = (typeof(sk))PT_REGS_PARM2(ctx);
      0: LdXMemDW dst: r2 src: r1 off: 104 imm: 0
       ; handle_new_connection(ctx, sk);
      1: Call -1 <handle_new_connection>
       ; return 0;
      2: MovImm dst: r0 imm: 0
      3: Exit
handle_new_connection:
       ; handle_new_connection(void *ctx, struct sock *sk)
      4: MovReg dst: r8 src: r2
      5: MovReg dst: r6 src: r1
      6: MovImm dst: r1 imm: 0
      ...

k_tcp_connect:
       ; sk = (typeof(sk))PT_REGS_PARM1(ctx);
      0: LdXMemDW dst: r2 src: r1 off: 112 imm: 0
       ; handle_new_connection(ctx, sk);
      1: Call -1 <handle_new_connection>
       ; return 0;
      2: MovImm dst: r0 imm: 0
      3: Exit
handle_new_connection:
       ; handle_new_connection(void *ctx, struct sock *sk)
      4: MovReg dst: r8 src: r2
      5: MovReg dst: r6 src: r1
      6: MovImm dst: r1 imm: 0
      ...

由此可知,在 eBPF ELF 文件中,call 指令跟目标函数之间有关联(上面的结果中,Call -1 后面标注了 <handle_new_connection>)。到底是怎么关联在一起的呢?在加载阶段能看出一二。

加载阶段

在加载阶段,对于 bpf2bpf 特性,需要做如下处理:

  1. 解析 ELF 文件中的 eBPF 汇编
  2. 找到 call 指令跟目标函数之间的关系
  3. 将多个 eBPF 汇编片段合并成一个
  4. 修正 call 指令

cilium/ebpf 中简略的函数调用路径如下:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
LoadCollectionSpecFromReader() // 第 1 步
|-->loadRelocations()
    |-->loadSectionRelocations()
|-->loadProgramSections()
    |-->loadFunctions()
    |   |-->func (ec *elfCode) relocateInstruction(ins *asm.Instruction, rel elf.Symbol) error {
    |           case programSection:
    |               case elf.STT_SECTION: // 第 2 步
    |
    |               // The function we want to call is in the indicated section,
    |               // at the offset encoded in the instruction itself. Reverse
    |               // the calculation to find the real function we're looking for.
    |               // A value of -1 references the first instruction in the section.
    |               offset := int64(int32(ins.Constant)+1) * asm.InstructionSize
    |               sym, ok := target.symbols[uint64(offset)]
    |               if !ok {
    |                   return fmt.Errorf("call: no symbol at offset %d", offset)
    |               }
    |
    |               name = sym.Name
    |               ins.Constant = -1
    |
    |           *ins = ins.WithReference(name)
    |       }
    |-->flattenPrograms()
        |-->flattenInstructions() // 第 3 步

LoadAndAssign()
|-->getValue()
    |-->loadProgram()
        |-->newProgramWithOptions()
            |-->insns.Marshal(buf, internal.NativeEndian)
                |-->insns.encodeFunctionReferences() { // 第 4 步
                        // Find all instructions tagged as references to other symbols.
                        // Depending on the instruction type, populate their constant or offset
                        // fields to point to the symbol they refer to within the insn stream.
                        iter = insns.Iterate()
                        for iter.Next() {
                            offset := iter.Offset
                            ins := iter.Ins

                            switch {
                            case ins.IsFunctionReference() && ins.Constant == -1:
                                symOffset, ok := symbolOffsets[ins.Reference()]
                                ins.Constant = int64(symOffset - offset - 1)
                    }
            |-->fd, err := sys.ProgLoad(attr) // 将 eBPF prog load 到内核,此时内核使用校验器进行校验
                |-->fd, err := BPF(BPF_PROG_LOAD, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
                    |-->r1, _, errNo := unix.Syscall(unix.SYS_BPF, uintptr(cmd), uintptr(attr), size)

校验阶段

就不讲解 eBPF prog load 过程了,直接看下校验器是怎么处理 bpf2bpf 特性的吧。

  1. JIT 将 eBPF 汇编翻译成机器码
  2. 重新计算指令的 imm 属性
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
|-->bpf_prog_load()
    |-->bpf_check() // run eBPF verifier
        |-->fixup_call_args()
            |-->jit_subprogs() {

                func[i] = bpf_int_jit_compile(func[i]); // 每个平台各自实现 bpf_int_jit_compile 函数

                // ...
                /* at this point all bpf functions were successfully JITed
                * now populate all bpf_calls with correct addresses and
                * run last pass of JIT
                */
                for (i = 0; i < env->subprog_cnt; i++) {
                    insn = func[i]->insnsi;
                    for (j = 0; j < func[i]->len; j++, insn++) {
                        if (insn->code != (BPF_JMP | BPF_CALL) ||
                            insn->src_reg != BPF_PSEUDO_CALL)
                            continue;
                        subprog = insn->off;
                        insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
                                __bpf_call_base; // 重新计算 imm,__bpf_call_base + imm 即是目标函数调用的起始地址
                    }

                    /* we use the aux data to keep a list of the start addresses
                    * of the JITed images for each function in the program
                    *
                    * for some architectures, such as powerpc64, the imm field
                    * might not be large enough to hold the offset of the start
                    * address of the callee's JITed image from __bpf_call_base
                    *
                    * in such cases, we can lookup the start address of a callee
                    * by using its subprog id, available from the off field of
                    * the call instruction, as an index for this list
                    */
                    func[i]->aux->func = func;
                    func[i]->aux->func_cnt = env->subprog_cnt;
                }
                for (i = 0; i < env->subprog_cnt; i++) {
                    old_bpf_func = func[i]->bpf_func;
                    tmp = bpf_int_jit_compile(func[i]);
                    if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
                        verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
                        err = -ENOTSUPP;
                        goto out_free;
                    }
                    cond_resched();
                }
                //...
            }

运行阶段

一切皆已准备好,就看看运行阶段是怎么跑起来的。

  1. 执行 JIT’ed 机器码
  2. 遇到 call 指令,进行一次函数调用
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
// include/linux/filter.h

/*
 * BPF_PROG_RUN - run @prog on @ctx and evaluate to its u32 return value.
 *
 * The program is entered through prog->bpf_func, which is called with @ctx
 * and prog->insnsi. When the bpf_stats_enabled_key static branch is on, the
 * call is timed with sched_clock() and the program's per-CPU stats (run
 * count and accumulated nanoseconds) are updated inside a
 * u64_stats_update_begin()/u64_stats_update_end() section; otherwise the
 * program is simply invoked.
 */
#define BPF_PROG_RUN(prog, ctx)    ({                       \
    u32 ret;                                                \
    cant_sleep();                                           \
    if (static_branch_unlikely(&bpf_stats_enabled_key)) {   \
        struct bpf_prog_stats *stats;                       \
        u64 start = sched_clock();                          \
        ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);     \
        stats = this_cpu_ptr(prog->aux->stats);             \
        u64_stats_update_begin(&stats->syncp);              \
        stats->cnt++;                                       \
        stats->nsecs += sched_clock() - start;              \
        u64_stats_update_end(&stats->syncp);                \
    } else {                                                \
        ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);     \
    }                                                       \
    ret; })

此时通过 bpf_func 执行 eBPF 程序:启用 JIT 时,bpf_func 指向 JIT 编译后的机器码;未启用 JIT 时,bpf_func 指向解释器,由解释器直接解析执行 bpf 汇编指令。而 bpf_func 属性是在 bpf prog load 的时候就准备好了。

暂且跳过 bpf prog load 过程,直接看看 bpf_func 执行中对 call 指令的处理吧。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
// kernel/bpf/core.c

/*
 * Abridged excerpt: only the handling of the CALL instruction is shown;
 * everything else in the function is elided with "// ...".
 */
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
    // ...

        /* CALL */
        JMP_CALL:
            /* Function call scratches BPF_R1-BPF_R5 registers,
            * preserves BPF_R6-BPF_R9, and stores return value
            * into BPF_R0.
            */
            /* The callee address is __bpf_call_base plus the imm field of
             * the current instruction; BPF_R1-BPF_R5 are passed as the
             * five arguments and the result is stored in BPF_R0.
             */
            BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
                                BPF_R4, BPF_R5);
            CONT;

    // ...
}

比较简单,直接调用 __bpf_call_base + imm 得到的函数。

小结

有了此次探究 bpf2bpf 特性的经验,后面探究其他特性就方便多了。期待后面更多的特性探索吧。