在《eBPF Talk: trace XDP 程序》中,介绍了如何使用 bpf 对 XDP 程序进行 trace;其实,使用 bpf 也是能够对 tc-bpf 程序进行 trace 的。

trace tc-bpf 程序的 demo

demo 效果如下:

1
2
3
4
5
6
7
8
# ./fentry_fexit-tc -d enp0s8
2023/07/16 23:34:11 Attached fentry(tc)
2023/07/16 23:34:11 Attached fexit(tc)
2023/07/16 23:34:11 Listening events...
2023/07/16 23:34:13 Tracing packet: 192.168.1.12 -> 192.168.1.138 (fentry)
2023/07/16 23:34:13 Tracing packet: 192.168.1.12 -> 192.168.1.138 (fexit: TC_ACT_OK)
2023/07/16 23:34:14 Tracing packet: 192.168.1.12 -> 192.168.1.138 (fentry)
2023/07/16 23:34:14 Tracing packet: 192.168.1.12 -> 192.168.1.138 (fexit: TC_ACT_OK)

其中使用的 trace 手段是 fentry 和 fexit。

demo 使用的 bpf 代码如下:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/*
 * Kind of probe that produced an event.
 * The value is narrowed to one byte and stored in event_t.probe_type.
 */
enum probing_type {
    PROBE_TYPE_DEFAULT  = 0, /* zero value; no specific probe */
    PROBE_TYPE_FENTRY   = 1, /* passed by the fentry_tc program */
    PROBE_TYPE_FEXIT    = 2, /* passed by the fexit_tc program */
    PROBE_TYPE_FREPLACE = 3, /* not used by the programs in this demo */
};

/*
 * Record emitted for every traced packet.
 * Declared packed; the explicit `pad` field rounds the two single-byte
 * fields up to four bytes.
 */
typedef struct event {
    __be32 saddr;       /* IPv4 source address, copied from the IP header */
    __be32 daddr;       /* IPv4 destination address, copied from the IP header */
    __u8 probe_type;    /* enum probing_type, narrowed to one byte */
    __u8 verdict;       /* verdict argument of the probe, narrowed to one byte */
    __u16 pad;          /* explicit filler, left as zero by the producer */
} __attribute__((packed)) event_t;

/*
 * Perf event array map named `events`; __handle_packet() writes event_t
 * records to it via bpf_perf_event_output().
 * Key and value sizes are both declared as 4 bytes.
 */
struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
    __uint(key_size, 4);
    __uint(value_size, 4);
} events SEC(".maps");

static __always_inline void
__handle_packet(void *ctx, struct iphdr *iph, enum probing_type type, int verdict)
{
    event_t ev = {};
    ev.saddr = BPF_CORE_READ(iph, saddr);
    ev.daddr = BPF_CORE_READ(iph, daddr);
    ev.probe_type = (__u8)type;
    ev.verdict = (__u8)verdict;

    bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &ev, sizeof(ev));
}

static __always_inline void
handle_tc(void *ctx, struct sk_buff *skb, enum probing_type type, int verdict)
{
    void *head = (void *)(long)BPF_CORE_READ(skb, head);
    __u16 l2_off = BPF_CORE_READ(skb, mac_header);
    __u16 l3_off = BPF_CORE_READ(skb, network_header);
    struct ethhdr *eth = head + l2_off;
    struct iphdr *iph = head + l3_off;

    if (BPF_CORE_READ(eth, h_proto) != bpf_htons(ETH_P_IP))
        return;

    if (BPF_CORE_READ(iph, protocol) != IPPROTO_ICMP)
        return;

    __handle_packet(ctx, iph, type, verdict);
}

/*
 * fentry program for the traced tc-bpf function.
 * `skb` is declared as the kernel's struct sk_buff *, and `ctx` is the
 * parameter supplied by the BPF_PROG() macro.
 * No return value exists at entry, so 0 is passed as the verdict.
 */
SEC("fentry/tc")
int BPF_PROG(fentry_tc, struct sk_buff *skb)
{
    handle_tc(ctx, skb, PROBE_TYPE_FENTRY, 0);
    return 0;
}

/*
 * fexit program for the traced tc-bpf function.
 * In addition to `skb` it declares `verdict`, the traced program's return
 * value (shown as TC_ACT_OK in the demo output), and forwards it to
 * handle_tc().
 */
SEC("fexit/tc")
int BPF_PROG(fexit_tc, struct sk_buff *skb, int verdict)
{
    handle_tc(ctx, skb, PROBE_TYPE_FEXIT, verdict);
    return 0;
}

对下面 tc-bpf 程序进行 trace 的时候,

1
2
3
4
5
/*
 * Minimal tc-bpf program used as the trace target.
 * It does not inspect the packet and always returns TC_ACT_OK.
 */
SEC("tc")
int dummy(struct __sk_buff *skb)
{
    return TC_ACT_OK;
}

用户态的 Go 代码需要做的事情是:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
    // Step 1: create the tc-bpf program that is going to be traced.
    tcDummy := spec.Programs["dummy"]
    dummyProg, err := ebpf.NewProgram(tcDummy)
    if err != nil {
        log.Fatalf("Failed to create dummy program: %v", err)
    }
    defer dummyProg.Close()

    // Step 2: for both tracing programs, set AttachTarget to the tc-bpf
    // program created above and AttachTo to the function name "dummy".
    tcFentry := spec.Programs["fentry_tc"]
    tcFentry.AttachTarget = dummyProg
    tcFentry.AttachTo = "dummy"
    tcFexit := spec.Programs["fexit_tc"]
    tcFexit.AttachTarget = dummyProg
    tcFexit.AttachTo = "dummy"
  1. 第一步,创建 tc-bpf 程序。
  2. 第二步,给 fentry 和 fexit 程序指定 AttachTarget 和 AttachTo。
  3. 其中,AttachTarget 是 tc-bpf 程序,AttachTo 是 tc-bpf 程序中的函数名。
  4. 即,将 fentry 和 fexit 程序 attach 到 tc-bpf 程序的 dummy 函数上。

P.S. demo 源代码:GitHub Asphaltt/learn-by-example/ebpf/fentry_fexit-tc

fentry/fexit 的函数参数

仔细对比上面 fentry/fexit 的函数定义和 tc-bpf 程序的函数定义:

1
2
3
4
5
6
7
8
/* Tracing side: the parameter is declared as the kernel type struct sk_buff *. */
SEC("fentry/tc")
int BPF_PROG(fentry_tc, struct sk_buff *skb);

/* fexit additionally declares the traced program's return value. */
SEC("fexit/tc")
int BPF_PROG(fexit_tc, struct sk_buff *skb, int verdict);

/* Traced side: the tc-bpf program declares struct __sk_buff * instead. */
SEC("tc")
int dummy(struct __sk_buff *skb);

为什么 fentry/fexit 的函数参数是 struct sk_buff *skb,而 tc-bpf 程序的函数参数是 struct __sk_buff *skb 呢?

fentry/fexit 的函数参数里不能再使用 ctx

这是因为 BPF_PROG() 宏里已默认提供了 ctx 参数,所以不能再使用 ctx 参数名了。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
/*
 * BPF_PROG(name, args...) expands into three pieces:
 *   1. a declaration and definition of `name(unsigned long long *ctx)`,
 *      the function that actually carries the SEC() annotation;
 *   2. a forward declaration of the always-inline helper
 *      `____name(unsigned long long *ctx, args...)`;
 *   3. the header of that helper, so the braces written after the macro
 *      invocation become its body.
 * `name` calls `____name(___bpf_ctx_cast(args))`; ___bpf_ctx_cast is defined
 * elsewhere and presumably unpacks ctx[] into the typed arguments (confirm
 * in bpf_tracing.h). Because `ctx` is already the first parameter of the
 * helper, user-supplied args must not reuse that name.
 */
#define BPF_PROG(name, args...)                                     \
name(unsigned long long *ctx);                                      \
static __always_inline typeof(name(0))                              \
____##name(unsigned long long *ctx, ##args);                        \
typeof(name(0)) name(unsigned long long *ctx)                       \
{                                                                   \
    _Pragma("GCC diagnostic push")                                  \
    _Pragma("GCC diagnostic ignored \"-Wint-conversion\"")          \
    return ____##name(___bpf_ctx_cast(args));                       \
    _Pragma("GCC diagnostic pop")                                   \
}                                                                   \
static __always_inline typeof(name(0))                              \
____##name(unsigned long long *ctx, ##args)

fentry/fexit 的函数参数类型用的是 struct sk_buff * 而不是 struct __sk_buff *

在理解了 fentry/fexit 的实现原理后,就知道 fentry/fexit 程序接受的参数是目标函数真实的参数,而非 tc-bpf 程序所使用的参数。

所以,fentry/fexit 程序的目标函数是 tc-bpf 程序里的入口函数时,fentry/fexit 程序接受的参数是 struct sk_buff *

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
// ${KERNEL}/net/sched/cls_bpf.c


/*
 * Excerpt of the cls_bpf classifier (elided parts are marked "// ...").
 * For each program on the list that is not skipped by tc_skip_sw(), it calls
 * bpf_prog_run() with the kernel's own struct sk_buff *skb as the context
 * argument.
 */
TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb,
                       const struct tcf_proto *tp,
                       struct tcf_result *res)
{
    // ...

    list_for_each_entry_rcu(prog, &head->plist, link) {
        int filter_res;

        qdisc_skb_cb(skb)->tc_classid = prog->res.classid;

        if (tc_skip_sw(prog->gen_flags)) {
            /* Program is not run in this branch. */
            filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0;
        } else if (at_ingress) {
            /* It is safe to push/pull even if skb_shared() */
            /* mac_len bytes are pushed before the run and pulled back after it. */
            __skb_push(skb, skb->mac_len);
            bpf_compute_data_pointers(skb);
            filter_res = bpf_prog_run(prog->filter, skb);
            __skb_pull(skb, skb->mac_len);
        } else {
            /* Non-ingress path: run directly, again with the real sk_buff. */
            bpf_compute_data_pointers(skb);
            filter_res = bpf_prog_run(prog->filter, skb);
        }
        // ...

        break;
    }

    return ret;
}

// ${KERNEL}/include/linux/filter.h

/*
 * Thin wrapper that forwards the program and its context pointer to
 * __bpf_prog_run() together with bpf_dispatcher_nop_func.
 * The context is an untyped const void *; cls_bpf_classify() passes its
 * struct sk_buff * here.
 */
static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx)
{
    return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);
}

运行 tc-bpf 程序时传的参数是 struct sk_buff *skb

tc-bpf 程序的入参为什么是 struct __sk_buff * 而不是 struct sk_buff *

其实,这是一个约定,约定 tc-bpf 程序的参数是 struct __sk_buff *。而在运行的时候,从实参中读取对应的属性。

简略地对比一下 struct __sk_buff 和 struct sk_buff 的定义:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/*
 * Context type declared by tc-bpf programs (excerpt).
 * Every field listed here is a __u32, including data, data_end and
 * data_meta.
 */
struct __sk_buff {
    __u32 len;
    __u32 pkt_type;
    __u32 mark;
    __u32 queue_mapping;
    // ...
    __u32 ifindex;
    __u32 tc_index;
    __u32 cb[5];
    __u32 hash;
    __u32 tc_classid;
    __u32 data;
    __u32 data_end;
    // ...
    __u32 data_meta;
    // ...
};

/*
 * The kernel's packet buffer (excerpt).
 * Its layout differs from struct __sk_buff: `dev` is a pointer nested inside
 * unions, `cb` is a 48-byte char array, `queue_mapping` is a __u16, and
 * `mark` shares storage with `reserved_tailroom`.
 */
struct sk_buff {
    union {
        struct {
            // ...
            union {
                struct net_device   *dev;
                // ...
            };
        };
        // ...
    };

    // ...

    char            cb[48] __aligned(8);

    // ...

    unsigned int        len,
                data_len;
    __u16           mac_len,
                hdr_len;

    __u16           queue_mapping;

    // ...

    union {
        __u32       mark;
        __u32       reserved_tailroom;
    };

   // ...
};

那么,在运行的时候是怎么从实参中读取对应的属性的呢?

其实在 verifier 阶段,就将 struct __sk_buff * 的属性访问替换成对应的 struct sk_buff * 的属性访问了。

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// ${KERNEL}/net/core/filter.c

/*
 * Verifier callback table whose .convert_ctx_access member points at
 * bpf_convert_ctx_access().
 * NOTE(review): the name suggests this table belongs to socket-filter
 * programs; tc programs presumably use a separate table
 * (tc_cls_act_verifier_ops) that also ends up in bpf_convert_ctx_access() —
 * confirm against net/core/filter.c.
 */
const struct bpf_verifier_ops sk_filter_verifier_ops = {
    .get_func_proto     = sk_filter_func_proto,
    .is_valid_access    = sk_filter_is_valid_access,
    .convert_ctx_access = bpf_convert_ctx_access,
    .gen_ld_abs         = bpf_gen_ld_abs,
};


/*
 * Rewrites one access to a struct __sk_buff field into instruction(s) that
 * operate on struct sk_buff (excerpt; elided cases are marked "// ...").
 *
 * type:        BPF_WRITE selects the store variants below; otherwise a load
 *              is emitted
 * si:          the original instruction; si->off selects the __sk_buff field
 * insn_buf:    buffer that receives the replacement instructions
 * prog:        program being rewritten; cb_access is set when cb[] is touched
 * target_size: out-parameter, passed to bpf_target_off() or set directly
 *
 * Returns the number of instructions written to insn_buf.
 */
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
                  const struct bpf_insn *si,
                  struct bpf_insn *insn_buf,
                  struct bpf_prog *prog, u32 *target_size)
{
    struct bpf_insn *insn = insn_buf;
    int off;

    switch (si->off) {
    /* Direct mapping: one 4-byte load from sk_buff.len. */
    case offsetof(struct __sk_buff, len):
        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                      bpf_target_off(struct sk_buff, len, 4,
                             target_size));
        break;

    // ...

    /* ingress_ifindex is read from sk_buff.skb_iif. */
    case offsetof(struct __sk_buff, ingress_ifindex):
        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                      bpf_target_off(struct sk_buff, skb_iif, 4,
                             target_size));
        break;

    /*
     * ifindex takes two loads: first skb->dev, then dev->ifindex.
     * The conditional jump skips the second load when the dev pointer is 0.
     */
    case offsetof(struct __sk_buff, ifindex):
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                      si->dst_reg, si->src_reg,
                      offsetof(struct sk_buff, dev));
        *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                      bpf_target_off(struct net_device, ifindex, 4,
                             target_size));
        break;

    case offsetof(struct __sk_buff, hash):
        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                      bpf_target_off(struct sk_buff, hash, 4,
                             target_size));
        break;

    /* mark supports both directions: a store on BPF_WRITE, else a load. */
    case offsetof(struct __sk_buff, mark):
        if (type == BPF_WRITE)
            *insn++ = BPF_EMIT_STORE(BPF_W, si,
                         bpf_target_off(struct sk_buff, mark, 4,
                                target_size));
        else
            *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                          bpf_target_off(struct sk_buff, mark, 4,
                                 target_size));
        break;

    /*
     * pkt_type: load one byte at PKT_TYPE_OFFSET and mask it with
     * PKT_TYPE_MAX; with __BIG_ENDIAN_BITFIELD the result is also shifted
     * right by 5.
     */
    case offsetof(struct __sk_buff, pkt_type):
        *target_size = 1;
        *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
                      PKT_TYPE_OFFSET);
        *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
        *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
#endif
        break;

    /*
     * queue_mapping is accessed as 2 bytes. Writes of values
     * >= NO_QUEUE_MAPPING are dropped: an immediate store becomes a no-op
     * jump, and a register store is preceded by a jump that skips it.
     */
    case offsetof(struct __sk_buff, queue_mapping):
        if (type == BPF_WRITE) {
            u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);

            if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
                *insn++ = BPF_JMP_A(0); /* noop */
                break;
            }

            if (BPF_CLASS(si->code) == BPF_STX)
                *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
            *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
        } else {
            *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                          bpf_target_off(struct sk_buff,
                                 queue_mapping,
                                 2, target_size));
        }
        break;

    // ...

    /*
     * cb[0]..cb[4] are redirected into qdisc_skb_cb.data inside sk_buff.cb:
     * the offset is translated and the original access size is kept.
     */
    case offsetof(struct __sk_buff, cb[0]) ...
         offsetofend(struct __sk_buff, cb[4]) - 1:
        BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
        BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
                  offsetof(struct qdisc_skb_cb, data)) %
                 sizeof(__u64));

        prog->cb_access = 1;
        off  = si->off;
        off -= offsetof(struct __sk_buff, cb[0]);
        off += offsetof(struct sk_buff, cb);
        off += offsetof(struct qdisc_skb_cb, data);
        if (type == BPF_WRITE)
            *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
        else
            *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
                          si->src_reg, off);
        break;

    // ...

    /* data: load sized to sk_buff.data, replacing the __u32 access. */
    case offsetof(struct __sk_buff, data):
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                      si->dst_reg, si->src_reg,
                      offsetof(struct sk_buff, data));
        break;

    /*
     * data_meta and data_end are read with a pointer-sized load from
     * sk_buff.cb plus the member's offset in struct bpf_skb_data_end.
     */
    case offsetof(struct __sk_buff, data_meta):
        off  = si->off;
        off -= offsetof(struct __sk_buff, data_meta);
        off += offsetof(struct sk_buff, cb);
        off += offsetof(struct bpf_skb_data_end, data_meta);
        *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                      si->src_reg, off);
        break;

    case offsetof(struct __sk_buff, data_end):
        off  = si->off;
        off -= offsetof(struct __sk_buff, data_end);
        off += offsetof(struct sk_buff, cb);
        off += offsetof(struct bpf_skb_data_end, data_end);
        *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                      si->src_reg, off);
        break;

    // ...
    }

    /* Number of instructions emitted for this access. */
    return insn - insn_buf;
}

小结

  • fentry/fexit 的函数参数是目标函数真实的参数,而非 tc-bpf 程序所使用的参数。
  • fexit 能够获取到 tc-bpf 程序的返回值。
  • tc-bpf 程序的入参为什么是 struct __sk_buff *?这是一个约定,约定 tc-bpf 程序的参数是 struct __sk_buff *。而在运行的时候,从实参中读取对应的属性。
  • 在 verifier 阶段,就将 struct __sk_buff * 的属性访问替换成对应的 struct sk_buff * 的属性访问了。