实现了对 IRQ 绑核、RPS/XPS 配置变更的跟踪后,接下来要跟踪的是网卡的 net.ipv4.conf.*.* sysctl 配置变更。

ipv4 sysctl 配置变更的内核函数

ipv4 sysctl 配置变更的方式如下:

  1. echo 1 > /proc/sys/net/ipv4/conf/all/${devconf}
  2. echo 1 > /proc/sys/net/ipv4/conf/default/${devconf}
  3. echo 1 > /proc/sys/net/ipv4/conf/${NET_DEV}/${devconf}

直接查看对应的内核源代码 net/ipv4/devinet.c 吧。

找到如下源代码:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// https://github.com/torvalds/linux/blob/master/net/ipv4/devinet.c

/*
 * Excerpt of a sysctl proc handler for integer devconf entries (parts of the
 * original are elided with "// ...").
 *
 * The integer behind ctl->data is sampled before and after proc_dointvec()
 * so that a write which actually changes the value can be detected.
 *
 * Returns whatever proc_dointvec() returned.
 */
static int devinet_conf_proc(struct ctl_table *ctl, int write,
                             void *buffer, size_t *lenp, loff_t *ppos)
{
    // Value before the read/write is carried out.
    int old_value = *(int *)ctl->data;
    int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
    // Value after proc_dointvec() has run.
    int new_value = *(int *)ctl->data;

    if (write) {
        // extra1 / extra2 hold the owning ipv4_devconf and the net namespace.
        struct ipv4_devconf *cnf = ctl->extra1;
        struct net *net = ctl->extra2;
        // Zero-based slot of this entry inside cnf->data (int pointer
        // difference); note the comparison below uses IPV4_DEVCONF_* - 1.
        int i = (int *)ctl->data - cnf->data;
        int ifindex;

        // Flag slot i in cnf->state.
        set_bit(i, cnf->state);

        // rp_filter changed: send an RTM_NEWNETCONF notification for the
        // ifindex that devinet_conf_ifindex() resolves for this cnf.
        if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
            new_value != old_value) {
            ifindex = devinet_conf_ifindex(net, cnf);
            inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                        NETCONFA_RP_FILTER,
                                        ifindex, cnf);
        }

        // ...
        // NOTE(review): code is elided above; the next brace presumably closes
        // a block that is not shown in this excerpt.
        }
    }

    return ret;
}

/*
 * Excerpt of the proc handler named for the "forwarding" entry (declarations
 * and the change-handling body are elided with "// ...").
 *
 * It delegates the read/write to proc_dointvec() and, on a write where *valp
 * differs from val, runs the elided change handling. Returns the
 * proc_dointvec() result.
 */
static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
                                  void *buffer, size_t *lenp, loff_t *ppos)
{
    // ...
    // NOTE(review): ret, valp and val are declared in the elided part;
    // presumably val is the value saved before the write - confirm against
    // the full kernel source.

    ret = proc_dointvec(ctl, write, buffer, lenp, ppos);

    // Only act when a write changed the stored value.
    if (write && *valp != val) {
        // ...
    }

    return ret;
}

/*
 * Excerpt of a proc handler that flushes the route cache when a write changes
 * the value (the setup code is elided with "// ...").
 *
 * Returns ret, which is assigned in the elided part.
 */
static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
    // ...
    // NOTE(review): ret, valp, val and net come from the elided code above.

    // A write that changed the value triggers rt_cache_flush() for this net.
    if (write && *valp != val)
        rt_cache_flush(net);

    return ret;
}

/*
 * Excerpt of the devinet sysctl table: a table header pointer plus an array
 * of IPV4_DEVCONF_MAX ctl_table entries, filled in through the
 * DEVINET_SYSCTL_*_ENTRY macros (most entries are elided with "// ...").
 *
 * NOTE(review): the macro definitions are not shown here; judging by their
 * names, the COMPLEX/RO/RW/FLUSHING variants select which proc handler an
 * entry gets - confirm against net/ipv4/devinet.c.
 */
static struct devinet_sysctl_table {
    struct ctl_table_header *sysctl_header;
    struct ctl_table devinet_vars[IPV4_DEVCONF_MAX];
} devinet_sysctl = {
    .devinet_vars = {
        // "forwarding" is given devinet_sysctl_forward explicitly.
        DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
                         devinet_sysctl_forward),
        DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
        DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),

        // ...

        DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
        // ...
    },
};

使用 bpftrace 确认一下:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# bpftrace -l 'k:devinet_conf_proc'
kprobe:devinet_conf_proc

# bpftrace -l 'k:devinet_sysctl_forward'
kprobe:devinet_sysctl_forward

# bpftrace -l 'k:ipv4_doint_and_flush'
kprobe:ipv4_doint_and_flush

# bpftrace -e 'k:devinet_conf_proc, k:devinet_sysctl_forward, k:ipv4_doint_and_flush { printf("ipv4 sysctl: %s\n", comm); }'
Attaching 3 probes...

分析其中的 devinet_conf_proc 函数:

  1. sysctl 配置变更的值是通过 ctl->data 获取的,类型是 int
  2. sysctl 配置项的索引是通过 (int *)ctl->data - cnf->data 计算得到的;该值需要 +1 后才对应 IPV4_DEVCONF_* 中的索引定义(所以上面的代码是与 IPV4_DEVCONF_RP_FILTER - 1 比较)。
  3. sysctl 配置的 ifindex 是通过 devinet_conf_ifindex() 获取的,可能是 ALL/DEFAULT/ifindex

跟踪 ipv4 sysctl 配置变更函数

直接对 devinet_conf_proc、devinet_sysctl_forward、ipv4_doint_and_flush 这三个函数进行 fexit 跟踪,以便观测配置变更的情况。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/*
 * Common body for the fexit programs attached to the ipv4 devconf sysctl
 * handlers.
 *
 * Reports one event per successful write. The event carries the ifindex
 * resolved by devinet_conf_ifindex(), the address of cnf->data, the address
 * stored in ctl->data, and the value read from that address. The two
 * addresses are reported raw; no entry index is computed here.
 *
 * @ctx:    program context, forwarded to handle_event()
 * @ctl:    sysctl table entry the traced handler was called with
 * @write:  non-zero when the traced call was a write
 * @buffer, @lenp, @ppos: remaining handler arguments (unused here)
 * @retval: return value of the traced handler
 *
 * Always returns BPF_OK.
 */
static __always_inline int
__fexit(void *ctx, struct ctl_table *ctl, int write, void *buffer, size_t *lenp,
        loff_t *ppos, int retval)
{
    struct event ev = {};
    struct ipv4_devconf *devconf;
    struct net *netns;
    void *value_addr;

    /* Reads and failed calls are not interesting: report writes only. */
    if (!write || retval != 0)
        return BPF_OK;

    devconf = (typeof(devconf)) BPF_CORE_READ(ctl, extra1);
    netns = (typeof(netns)) BPF_CORE_READ(ctl, extra2);
    value_addr = BPF_CORE_READ(ctl, data);

    ev.ifindex = devinet_conf_ifindex(netns, devconf);
    ev.cnf_data_ptr = (__u64) devconf + offsetof(struct ipv4_devconf, data);
    ev.ctl_data_ptr = (__u64) value_addr;

    /* Fetch the value that the handler has just stored. */
    bpf_probe_read_kernel(&ev.devconf_value, sizeof(ev.devconf_value),
                          value_addr);

    handle_event(ctx, &ev);

    return BPF_OK;
}

/*
 * fexit program for devinet_conf_proc (per the SEC name). It passes the
 * handler's arguments and return value straight to __fexit().
 * ctx is not declared here; it is supplied by the BPF_PROG macro.
 */
SEC("fexit/devinet_conf_proc")
int BPF_PROG(fexit_devinet_conf_proc, struct ctl_table *ctl, int write,
             void *buffer, size_t *lenp, loff_t *ppos, int retval)
{
    return __fexit(ctx, ctl, write, buffer, lenp, ppos, retval);
}

/*
 * fexit program for ipv4_doint_and_flush (per the SEC name). It passes the
 * handler's arguments and return value straight to __fexit().
 * ctx is not declared here; it is supplied by the BPF_PROG macro.
 */
SEC("fexit/ipv4_doint_and_flush")
int BPF_PROG(fexit_ipv4_doint_and_flush, struct ctl_table *ctl, int write,
             void *buffer, size_t *lenp, loff_t *ppos, int retval)
{
    return __fexit(ctx, ctl, write, buffer, lenp, ppos, retval);
}

/*
 * fexit program for devinet_sysctl_forward (per the SEC name). It passes the
 * handler's arguments and return value straight to __fexit().
 * ctx is not declared here; it is supplied by the BPF_PROG macro.
 */
SEC("fexit/devinet_sysctl_forward")
int BPF_PROG(fexit_devinet_sysctl_forward, struct ctl_table *ctl, int write,
             void *buffer, size_t *lenp, loff_t *ppos, int retval)
{
    return __fexit(ctx, ctl, write, buffer, lenp, ppos, retval);
}

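其中,不要傻傻地在 bpf 里计算配置项索引,而是留待用户态程序计算:bpf 程序只需上报 cnf->data 的地址(cnf_data_ptr)和 ctl->data 的地址(ctl_data_ptr),用户态按 (ctl_data_ptr - cnf_data_ptr) / sizeof(int) + 1 即可得到配置项索引。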

跑起来后:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
$ sudo zsh -c 'echo 0 > /proc/sys/net/ipv4/conf/all/forwarding'
$ sudo zsh -c 'echo 0 > /proc/sys/net/ipv4/conf/default/forwarding'

# or

$ sudo sysctl -w net.ipv4.conf.all.forwarding=0
net.ipv4.conf.all.forwarding = 0
$ sudo sysctl -w net.ipv4.conf.default.forwarding=0
net.ipv4.conf.default.forwarding = 0

$ sudo ./fexit_ipv4_sysctl
2024/07/07 13:35:21 Attached fexit(devinet_conf_proc)
2024/07/07 13:35:21 Attached fexit(ipv4_doint_and_flush)
2024/07/07 13:35:21 Attached fexit(devinet_sysctl_forward)
2024/07/07 13:35:23 Update forwarding to 0 on interface ALL(-1) by process /usr/bin/zsh
2024/07/07 13:35:26 Update forwarding to 0 on interface DEFAULT(-2) by process /usr/bin/zsh

完整的源代码:fexit_ipv4_sysctl

总结

跟踪 ipv4 sysctl 配置变更的方式和跟踪 RPS/XPS 配置变更的方式类似:使用 fexit 而不是 kprobe。

在 bpf 代码里,需要注意 ifindex 的获取,以及配置项索引的计算。