为了更高的性能,需要将 XDP 程序下沉到网卡驱动里去运行。

因为服务器使用的物理网卡是 Mellanox,所以就研究一下 Mellanox 驱动里是怎么运行 XDP 程序的。

XDP on Mellanox

直接在内核源代码里的 /drivers/net/ethernet/mellanox 目录下搜索 XDP_REDIRECT,就能找到如下代码片段:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
// ${KERNEL}/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c

/* returns true if packet was consumed by xdp */
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct page *page,
              struct bpf_prog *prog, struct xdp_buff *xdp)
{
    u32 act;
    int err;

    /* Run the attached XDP program on this buffer; act is its verdict. */
    act = bpf_prog_run_xdp(prog, xdp);
    switch (act) {
    case XDP_PASS:
        /* Not consumed: caller goes on to build an skb for the stack. */
        return false;
    case XDP_TX:
        /* Bounce the packet back out the same device via the XDP SQ. */
        if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, page, xdp)))
            goto xdp_abort;
        __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
        return true;
    case XDP_REDIRECT:
        /* When XDP enabled then page-refcnt==1 here */
        err = xdp_do_redirect(rq->netdev, xdp, prog);
        if (unlikely(err))
            goto xdp_abort;
        __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
        __set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags);
        /* XSK pool buffers manage their own DMA; unmap only regular pages. */
        if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
            mlx5e_page_dma_unmap(rq, page);
        rq->stats->xdp_redirect++;
        return true;
    default:
        /* Unknown verdict: warn, then fall through to the aborted path. */
        bpf_warn_invalid_xdp_action(rq->netdev, prog, act);
        fallthrough;
    case XDP_ABORTED:
xdp_abort:
        trace_xdp_exception(rq->netdev, prog, act);
        fallthrough;
    case XDP_DROP:
        /* Drop: only a statistics bump; buffer is recycled by the caller. */
        rq->stats->xdp_drop++;
        return true;
    }
}

bpf_prog_run_xdp() 就是真实运行 XDP 程序的函数。

XDP_PASS on Mellanox

如果 XDP 程序里 XDP_PASS 该网络包到内核,Mellanox 网卡驱动还做了哪些处理呢?

有 3 个地方调了 mlx5e_xdp_handle() 函数。

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
// ${KERNEL}/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

// 第 1 个
/* Build an skb from a linear (single-buffer) RX completion.
 * Excerpted with elisions ("// ...") from the upstream kernel source.
 */
static struct sk_buff *
mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
              u32 cqe_bcnt)
{
    // ...
    net_prefetch(data);

    prog = rcu_dereference(rq->xdp_prog);
    if (prog) {
        struct xdp_buff xdp;

        net_prefetchw(va); /* xdp_frame data area */
        mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
        if (mlx5e_xdp_handle(rq, au->page, prog, &xdp))
            return NULL; /* page/packet was consumed by XDP */

        /* XDP may have moved data/data_meta/data_end (adjust_head/meta/tail);
         * recompute headroom, metadata size and payload length from xdp_buff.
         */
        rx_headroom = xdp.data - xdp.data_hard_start;
        metasize = xdp.data - xdp.data_meta;
        cqe_bcnt = xdp.data_end - xdp.data;
    }
    frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
    skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize);
    // ...

    return skb;
}

// 第 2 个
/* Build an skb from a non-linear (multi-fragment) RX completion.
 * Excerpted with elisions ("// ...") from the upstream kernel source.
 */
static struct sk_buff *
mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
                 u32 cqe_bcnt)
{
    // ...
        prog = rcu_dereference(rq->xdp_prog);
    if (prog && mlx5e_xdp_handle(rq, au->page, prog, &xdp)) {
        if (test_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
            int i;

            /* Packet was consumed for transmit: release the remaining
             * RX fragments of this WQE back to the driver.
             */
            for (i = wi - head_wi; i < rq->wqe.info.num_frags; i++)
                mlx5e_put_rx_frag(rq, &head_wi[i], true);
        }
        return NULL; /* page/packet was consumed by XDP */
    }

    /* XDP_PASS (or no program): build the skb from the possibly-adjusted
     * xdp_buff pointers, preserving headroom, length and metadata size.
     */
    skb = mlx5e_build_linear_skb(rq, xdp.data_hard_start, rq->buff.frame0_sz,
                     xdp.data - xdp.data_hard_start,
                     xdp.data_end - xdp.data,
                     xdp.data - xdp.data_meta);
    // ...

    return skb;
}

// 第 3 个
/* Build an skb from a multi-packet WQE (MPWQE) linear RX completion.
 * Excerpted with elisions ("// ...") from the upstream kernel source.
 */
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
                u16 cqe_bcnt, u32 head_offset, u32 page_idx)
{
    // ...
    net_prefetch(data);

    prog = rcu_dereference(rq->xdp_prog);
    if (prog) {
        struct xdp_buff xdp;

        net_prefetchw(va); /* xdp_frame data area */
        mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
        if (mlx5e_xdp_handle(rq, au->page, prog, &xdp)) {
            /* Record per-page XMIT state in the MPWQE's bitmap. */
            if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
                __set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */
            return NULL; /* page/packet was consumed by XDP */
        }

        /* Recompute layout from the possibly-adjusted xdp_buff pointers. */
        rx_headroom = xdp.data - xdp.data_hard_start;
        metasize = xdp.data - xdp.data_meta;
        cqe_bcnt = xdp.data_end - xdp.data;
    }
    frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
    skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize);
    // ...

    return skb;
}

/* Wrap an RX buffer in an skb without copying the payload (build_skb).
 * Returns NULL (and bumps buff_alloc_err) if skb allocation fails.
 */
static inline
struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
                       u32 frag_size, u16 headroom,
                       u32 cqe_bcnt, u32 metasize)
{
    struct sk_buff *skb = build_skb(va, frag_size);

    if (unlikely(!skb)) {
        rq->stats->buff_alloc_err++;
        return NULL;
    }

    /* Position the payload: skip headroom, then expose cqe_bcnt bytes. */
    skb_reserve(skb, headroom);
    skb_put(skb, cqe_bcnt);

    /* Carry XDP metadata (from bpf_xdp_adjust_meta()) over to the skb. */
    if (metasize)
        skb_metadata_set(skb, metasize);

    return skb;
}

从上面代码片段可以看出,在执行 XDP 程序之后,都会调用 mlx5e_build_linear_skb() 函数来构建 skb。

XDP_TX on Mellanox

如果 XDP 程序里 XDP_TX 该网络包发送出去,Mellanox 网卡驱动还做了哪些处理呢?

1
2
3
4
// ${KERNEL}/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c

mlx5e_xmit_xdp_buff()
|-->mlx5e_xdpi_fifo_push()

直接在驱动内部调 mlx5e_xdpi_fifo_push() 函数发送出去了。

XDP_REDIRECT on Mellanox

如果 XDP 程序里 XDP_REDIRECT 转发该网络包,Mellanox 网卡驱动还做了哪些处理呢?

驱动里调 xdp_do_redirect() 进行了转发处理。

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
// ${KERNEL}/net/core/filter.c

/* Entry point for XDP_REDIRECT: convert the xdp_buff into an xdp_frame
 * and forward it to the target recorded by the bpf_redirect*() helper.
 * Excerpted with elisions ("// ...") from the upstream kernel source.
 */
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
            struct bpf_prog *xdp_prog)
{
    // ...

    return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
                       xdp_prog);
}

/* Dispatch a converted xdp_frame to the redirect target. Only the
 * plain-device branch (BPF_MAP_TYPE_UNSPEC) of the surrounding switch
 * is shown here; the rest is elided ("// ...") in this excerpt.
 */
static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
                           struct net_device *dev,
                           struct xdp_frame *xdpf,
                           struct bpf_prog *xdp_prog)
{
    // ...
    case BPF_MAP_TYPE_UNSPEC:
        if (map_id == INT_MAX) {
            /* Redirect by ifindex: look up the target netdev under RCU. */
            fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
            if (unlikely(!fwd)) {
                err = -EINVAL;
                break;
            }
            err = dev_xdp_enqueue(fwd, xdpf, dev);
            break;
        }
        fallthrough;
    // ...
}

// ${KERNEL}/kernel/bpf/devmap.c

/* Enqueue an xdp_frame for transmission on redirect target @dev;
 * @dev_rx is the device the frame arrived on. Thin wrapper that
 * calls __xdp_enqueue() with no devmap program.
 */
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
            struct net_device *dev_rx)
{
    return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}

/* Validate the target device and queue the frame for transmit.
 * Returns 0 on success or a negative errno.
 */
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                struct net_device *dev_rx,
                struct bpf_prog *xdp_prog)
{
    int err;

    /* Target must implement ndo_xdp_xmit() to accept redirected frames. */
    if (!dev->netdev_ops->ndo_xdp_xmit)
        return -EOPNOTSUPP;

    /* Check the frame can be forwarded to this device (e.g. fits its MTU). */
    err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
    if (unlikely(err))
        return err;

    /* Queue the frame for batched transmit via ndo_xdp_xmit(). */
    bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
    return 0;
}

// ${KERNEL}/include/net/xdp.h

/* Populate @xdp_frame from @xdp. Returns 0 on success, or -ENOSPC when
 * the headroom cannot hold the xdp_frame struct or the driver failed to
 * reserve tailroom for skb_shared_info.
 */
static inline
int xdp_update_frame_from_buff(struct xdp_buff *xdp,
                   struct xdp_frame *xdp_frame)
{
    int metasize, headroom;

    /* Assure headroom is available for storing info */
    headroom = xdp->data - xdp->data_hard_start;
    metasize = xdp->data - xdp->data_meta;
    metasize = metasize > 0 ? metasize : 0;
    if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
        return -ENOSPC;

    /* Catch if driver didn't reserve tailroom for skb_shared_info */
    if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) {
        XDP_WARN("Driver BUG: missing reserved tailroom");
        return -ENOSPC;
    }

    /* Copy layout info; headroom excludes the space taken by the frame
     * struct itself, which lives at data_hard_start.
     */
    xdp_frame->data = xdp->data;
    xdp_frame->len  = xdp->data_end - xdp->data;
    xdp_frame->headroom = headroom - sizeof(*xdp_frame);
    xdp_frame->metasize = metasize;
    xdp_frame->frame_sz = xdp->frame_sz;
    xdp_frame->flags = xdp->flags;

    return 0;
}

/* Convert xdp_buff to xdp_frame */
static inline
struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
{
    struct xdp_frame *xdp_frame;

    /* Zero-copy (AF_XDP) buffers need a copying conversion path. */
    if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
        return xdp_convert_zc_to_xdp_frame(xdp);

    /* Store info in top of packet */
    xdp_frame = xdp->data_hard_start;
    if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0))
        return NULL;

    /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
    xdp_frame->mem = xdp->rxq->mem;

    return xdp_frame;
}

以上代码片段的主要处理逻辑:

  1. xdp_buff 转为 xdp_frame
  2. 最后调目标设备的 ndo_xdp_xmit() 函数将 xdp_frame 发送出去。

关于 XDP_REDIRECT 的更多讲解,请看:

XDP_ABORTED and XDP_DROP on Mellanox

Mellanox 驱动对它们没有复杂的处理逻辑:

1
        rq->stats->xdp_drop++;

只是递增了 xdp_drop 统计。

bpf_xdp_adjust_head()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
// ${KERNEL}/net/core/filter.c

/* bpf_xdp_adjust_head() helper: move xdp->data by @offset (negative grows
 * headroom into the packet head, positive shrinks it). Metadata, if any,
 * is moved along so it stays immediately before the new data pointer.
 */
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
    /* Headroom below this point is reserved for the xdp_frame struct. */
    void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
    unsigned long metalen = xdp_get_metalen(xdp);
    void *data_start = xdp_frame_end + metalen;
    void *data = xdp->data + offset;

    /* New data must stay past the reserved area and leave at least an
     * Ethernet header's worth of payload.
     */
    if (unlikely(data < data_start ||
             data > xdp->data_end - ETH_HLEN))
        return -EINVAL;

    /* Relocate existing metadata rather than letting it be overwritten. */
    if (metalen)
        memmove(xdp->data_meta + offset,
            xdp->data_meta, metalen);
    xdp->data_meta += offset;
    xdp->data = data;

    return 0;
}

注意其中一个细节:如果有 metadata,metadata 会被 memmove 移动,而不会被覆盖。

bpf_xdp_adjust_meta()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
// ${KERNEL}/net/core/filter.c

/* bpf_xdp_adjust_meta() helper: move xdp->data_meta by @offset, growing
 * (negative offset) or shrinking the metadata area between the reserved
 * xdp_frame space and xdp->data.
 */
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
    /* Metadata may not overlap the space reserved for the xdp_frame. */
    void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
    void *meta = xdp->data_meta + offset;
    unsigned long metalen = xdp->data - meta;

    if (xdp_data_meta_unsupported(xdp))
        return -ENOTSUPP;
    /* New meta pointer must stay within [xdp_frame_end, data]. */
    if (unlikely(meta < xdp_frame_end ||
             meta > xdp->data))
        return -EINVAL;
    /* Reject metadata lengths outside the allowed size/alignment. */
    if (unlikely(xdp_metalen_invalid(metalen)))
        return -EACCES;

    xdp->data_meta = meta;

    return 0;
}

更详细的讲解请看:

bpf_xdp_adjust_tail()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
// ${KERNEL}/net/core/filter.c

/* bpf_xdp_adjust_tail() helper: move xdp->data_end by @offset (positive
 * grows the packet into the tailroom, negative trims it). Excerpted with
 * elisions ("// ...") from the upstream kernel source.
 */
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
    void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
    void *data_end = xdp->data_end + offset;

    /* Multi-buffer packets adjust the last fragment instead. */
    if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
        if (offset < 0)
            return bpf_xdp_frags_shrink_tail(xdp, -offset);

        return bpf_xdp_frags_increase_tail(xdp, offset);
    }

    // ...

    /* Clear memory area on grow, can contain uninit kernel memory */
    if (offset > 0)
        memset(xdp->data_end, 0, offset);

    xdp->data_end = data_end;

    return 0;
}

Q&A

Q:经过 XDP adjust 后的网络包,能否 PASS 到内核?

A:可以。回头看 XDP_PASS on Mellanox 的处理逻辑,在调 mlx5e_build_linear_skb() 构建 skb 时便处理好了 head、meta、tail。

Q:经过 XDP adjust 后的网络包,在 REDIRECT 时会失去 meta 吗?

A:不会。以 veth 虚拟设备为例,veth 网卡驱动在将 xdp_frame 转为 skb 时,调 skb_metadata_set() 设置 meta 信息(意即,meta 信息可以跨设备传递):

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
// ${KERNEL}/net/core/xdp.c

/* Build an skb around an xdp_frame's buffer without copying the payload.
 * Excerpted with elisions ("// ...") from the upstream kernel source.
 */
struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
                       struct sk_buff *skb,
                       struct net_device *dev)
{
    // ...

    /* Part of headroom was reserved to xdpf */
    headroom = sizeof(*xdpf) + xdpf->headroom;

    /* Memory size backing xdp_frame data already have reserved
     * room for build_skb to place skb_shared_info in tailroom.
     */
    frame_size = xdpf->frame_sz;

    hard_start = xdpf->data - headroom;
    skb = build_skb_around(skb, hard_start, frame_size);
    if (unlikely(!skb))
        return NULL;

    skb_reserve(skb, headroom);
    __skb_put(skb, xdpf->len);
    /* Metadata set by bpf_xdp_adjust_meta() survives the conversion. */
    if (xdpf->metasize)
        skb_metadata_set(skb, xdpf->metasize);

    // ...

    return skb;
}

总结

将 XDP on Mellanox 研究透彻后,就不再害怕将 XDP 程序下发到 Mellanox 驱动去运行的各种 corner case。