想弄清楚 tracepoint 的工作原理,实在太难了;网络上的资料比较少,而且不够深入,甚至连 kernel 文档也是如此。

本文尝试从源代码的角度来分析 tracepoint 的工作原理。

抛砖引玉,欢迎大家指正。

tracepoint demo

在 eBPF 里使用 tracepoint,是比较简单的。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
/*
 * Context layout seen by the BPF program attached to the
 * netlink:netlink_extack tracepoint.
 */
struct netlink_extack_error_ctx {
    unsigned long unused; // leading placeholder; on 64-bit targets it spans offsets 0-7, where the format file lists the common_* fields

    __u32 msg; // __data_loc char[] msg; the program masks the low 16 bits to get the string's offset from ctx
};

SEC("tp/netlink/netlink_extack")
int tp__netlink_extack(struct netlink_extack_error_ctx *ctx)
{
    /*
     * The low 16 bits of the __data_loc word give the string's offset
     * from the start of ctx (the kernel stores the length in the upper
     * 16 bits, which is why they are masked off here).
     */
    __u64 str_off = ctx->msg & 0xFFFF;
    char *msg = (char *)ctx + str_off;

    /* Hand the located string to the shared output helper. */
    __output_msg(ctx, msg, PROBE_TYPE_DEFAULT, 0);

    return 0;
}

其中,需要自定义 ctx 结构体,这是 bpf 里的做法。

ctx 结构体的第一个字段必须是 unsigned long unused,它占据结构体开头的 8 个字节(对应下文 format 输出里 offset 0~7 的 common_* 字段);这部分是预留给 tracepoint 自身使用的,不能在 bpf 程序里访问。

P.S. demo 源代码:GitHub Asphaltt/learn-by-example/ebpf/tracepoint

如何确定 tracepoint ctx 结构体的其它字段信息呢?

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# cat /sys/kernel/debug/tracing/events/netlink/netlink_extack/format
name: netlink_extack
ID: 1568
format:
    field:unsigned short common_type;   offset:0;   size:2; signed:0;
    field:unsigned char common_flags;   offset:2;   size:1; signed:0;
    field:unsigned char common_preempt_count;   offset:3;   size:1; signed:0;
    field:int common_pid;   offset:4;   size:4; signed:1;

    field:__data_loc char[] msg;    offset:8;   size:4; signed:0;

print fmt: "msg=%s", __get_str(msg)

# bpftrace -lv 'tracepoint:netlink:netlink_extack'
tracepoint:netlink:netlink_extack
    __data_loc char[] msg

通过上面两种方式,可以得到 tracepoint ctx 结构体的其它字段信息:

  1. msg 字段的类型是 __data_loc char[],即 char *

不过,msg 字段的真实类型真的是 char * 吗?答案请查看:

不过,话说回来,cat /sys/kernel/debug/tracing/events/netlink/netlink_extack/format 里已说明该字段的详细信息:

1
2
3
4
5
6
    field:__data_loc char[] msg;    offset:8;   size:4; signed:0;

# offset:该字段在 `ctx` 结构体中的偏移量;
# size:该字段的大小;
# signed:该字段是否是有符号的。
# 最终在 ctx struct 里,该字段的类型是 `__u32`,而不是 `char *`。

而在 Go 代码里,只需要:

1
2
3
4
5
6
7
    // Attach the program obj.TpNetlinkExtack to the tracepoint identified by
    // group "netlink" and name "netlink_extack".
    if tp, err := link.Tracepoint("netlink", "netlink_extack", obj.TpNetlinkExtack, nil); err != nil {
        // Attaching failed: log the error and leave the enclosing function.
        log.Printf("Failed to attach tracepoint(netlink_extack): %v", err)
        return
    } else {
        log.Printf("Attached to tracepoint(netlink_extack)")
        // Deferred, so the link is closed only when the enclosing function returns.
        defer tp.Close()
    }

即可将 tracepoint 程序 attach 到 tracepoint 上。

tracepoint 定义

在内核里,一个 tracepoint 是怎么定义出来的呢?

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
// ${KERNEL}/include/trace/events/netlink.h

TRACE_EVENT(netlink_extack,

    TP_PROTO(const char *msg),

    TP_ARGS(msg),

    TP_STRUCT__entry(
        __string(   msg,    msg )
    ),

    TP_fast_assign(
        __assign_str(msg, msg);
    ),

    TP_printk("msg=%s", __get_str(msg))
);

将这些宏一层一层打开来看:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
TP_PROTO(const char *msg)
// 宏定义: // ${KERNEL}/include/linux/tracepoint.h
/* Expands to its arguments unchanged. */
#define TP_PROTO(args...)   args

TP_ARGS(msg)
// 宏定义: // ${KERNEL}/include/linux/tracepoint.h
/* Expands to its arguments unchanged. */
#define TP_ARGS(args...)    args

TP_STRUCT__entry(               // 定义 entry struct
    __string(   msg,    msg )
)
// 宏定义: // ${KERNEL}/include/trace/stages/stage1_struct_define.h
/* In this stage the field list is passed through unchanged. */
#define TP_STRUCT__entry(args...) args

__string(   msg,    msg )
// 宏定义: // ${KERNEL}/include/trace/stages/stage1_struct_define.h
/*
 * In this stage a dynamic array contributes a single u32 member named
 * __data_loc_<item>; the type and len arguments are not used by the
 * expansion. __string(item, src) is a dynamic array of char with len -1,
 * so __string(msg, msg) becomes "u32 __data_loc_msg;".
 */
#define __dynamic_array(type, item, len) u32 __data_loc_##item;
#define __string(item, src) __dynamic_array(char, item, -1)

TP_fast_assign(
    __assign_str(msg, msg);
)
// 宏定义: // ${KERNEL}/include/trace/stages/stage6_event_callback.h
/* In this stage the assignment statements are passed through unchanged. */
#define TP_fast_assign(args...) args

__assign_str(msg, msg)
// 宏定义: // ${KERNEL}/include/trace/stages/stage6_event_callback.h
/*
 * Copies src with strcpy() into the location yielded by __get_str(dst).
 * A NULL src is recorded as the literal string "(null)".
 */
#define __assign_str(dst, src)                      \
    strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)");

TP_printk("msg=%s", __get_str(msg))
// 宏定义: // ${KERNEL}/include/trace/stages/stage3_trace_output.h
#define TP_printk(fmt, args...) fmt "\n", args

// *******************************

TRACE_EVENT(netlink_extack,

    TP_PROTO(const char *msg),

    TP_ARGS(msg),

    TP_STRUCT__entry(
        __string(   msg,    msg )
    ),

    TP_fast_assign(
        __assign_str(msg, msg);
    ),

    TP_printk("msg=%s", __get_str(msg))
);
// 宏定义: // ${KERNEL}/include/trace/trace_events.h
/*
 * A TRACE_EVENT is an event class plus one event of that class: it expands
 * to DECLARE_EVENT_CLASS with `name` as the class, followed by DEFINE_EVENT
 * with `name` used as both the template and the event name.
 */
#define TRACE_EVENT(name, proto, args, tstruct, assign, print)  \
    DECLARE_EVENT_CLASS(name,                                   \
                 PARAMS(proto),                                 \
                 PARAMS(args),                                  \
                 PARAMS(tstruct),                               \
                 PARAMS(assign),                                \
                 PARAMS(print));                                \
    DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));

// DECLARE_EVENT_CLASS() 宏定义:  // ${KERNEL}/include/trace/perf.h
/*
 * Generates perf_trace_<call>(void *__data, proto), where __data is used as
 * the struct trace_event_call pointer. The generated function:
 *   1. computes the dynamic data size with trace_event_get_offsets_<call>();
 *   2. returns early when bpf_prog_array_valid(event_call) is false, __task
 *      is a compile-time NULL, and this CPU's perf_events list is empty;
 *   3. sizes the entry as data + sizeof(*entry) + sizeof(u32), aligned to
 *      sizeof(u64), then subtracts the sizeof(u32) again;
 *   4. allocates the entry with perf_trace_buf_alloc() and returns if that
 *      fails;
 *   5. fetches the caller's registers, expands tstruct and the assign
 *      statements;
 *   6. passes the entry to perf_trace_run_bpf_submit().
 */
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
static notrace void                                                     \
perf_trace_##call(void *__data, proto)                                  \
{                                                                       \
    struct trace_event_call *event_call = __data;                       \
    struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
    struct trace_event_raw_##call *entry;                               \
    struct pt_regs *__regs;                                             \
    u64 __count = 1;                                                    \
    struct task_struct *__task = NULL;                                  \
    struct hlist_head *head;                                            \
    int __entry_size;                                                   \
    int __data_size;                                                    \
    int rctx;                                                           \
                                                                        \
    __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
                                                                        \
    head = this_cpu_ptr(event_call->perf_events);                       \
    if (!bpf_prog_array_valid(event_call) &&                            \
        __builtin_constant_p(!__task) && !__task &&                     \
        hlist_empty(head))                                              \
        return;                                                         \
                                                                        \
    __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),    \
                 sizeof(u64));                                          \
    __entry_size -= sizeof(u32);                                        \
                                                                        \
    entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx);         \
    if (!entry)                                                         \
        return;                                                         \
                                                                        \
    perf_fetch_caller_regs(__regs);                                     \
                                                                        \
    tstruct                                                             \
                                                                        \
    { assign; }                                                         \
                                                                        \
    perf_trace_run_bpf_submit(entry, __entry_size, rctx,                \
                  event_call, __count, __regs,                          \
                  head, __task);                                        \
}

// DEFINE_EVENT() 宏定义:  // ${KERNEL}/include/linux/tracepoint.h
/* Forwards to DECLARE_TRACE; the template argument is not used here. */
#define DEFINE_EVENT(template, name, proto, args)       \
    DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
// 继续:
/*
 * Forwards to __DECLARE_TRACE with cond set to
 * cpu_online(raw_smp_processor_id()) and data_proto set to
 * "void *__data" followed by the tracepoint's own proto.
 */
#define DECLARE_TRACE(name, proto, args)                \
    __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),      \
            cpu_online(raw_smp_processor_id()),     \
            PARAMS(void *__data, proto))
// 继续:
/*
 * Emits the per-tracepoint declarations and inline helpers:
 *   - extern __traceiter_<name>(), the static call tp_func_<name> bound to
 *     it, and extern struct tracepoint __tracepoint_<name>;
 *   - trace_<name>(proto): runs __DO_TRACE only when the tracepoint's
 *     static key is enabled; with CONFIG_LOCKDEP and cond true it also
 *     warns once if RCU is not watching;
 *   - register_trace_<name>(), register_trace_prio_<name>() and
 *     unregister_trace_<name>(): wrappers around tracepoint_probe_register(),
 *     tracepoint_probe_register_prio() and tracepoint_probe_unregister();
 *   - check_trace_callback_type_<name>(): empty body, takes a callback of
 *     the probe's type;
 *   - trace_<name>_enabled(): returns the static key state.
 */
#define __DECLARE_TRACE(name, proto, args, cond, data_proto)            \
    extern int __traceiter_##name(data_proto);                          \
    DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name);            \
    extern struct tracepoint __tracepoint_##name;                       \
    static inline void trace_##name(proto)                              \
    {                                                                   \
        if (static_key_false(&__tracepoint_##name.key))                 \
            __DO_TRACE(name,                                            \
                TP_ARGS(args),                                          \
                TP_CONDITION(cond), 0);                                 \
        if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {                     \
            WARN_ON_ONCE(!rcu_is_watching());                           \
        }                                                               \
    }                                                                   \
    __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args),              \
                PARAMS(cond))                                           \
    static inline int                                                   \
    register_trace_##name(void (*probe)(data_proto), void *data)        \
    {                                                                   \
        return tracepoint_probe_register(&__tracepoint_##name,          \
                        (void *)probe, data);                           \
    }                                                                   \
    static inline int                                                   \
    register_trace_prio_##name(void (*probe)(data_proto), void *data,   \
                   int prio)                                            \
    {                                                                   \
        return tracepoint_probe_register_prio(&__tracepoint_##name,     \
                          (void *)probe, data, prio);                   \
    }                                                                   \
    static inline int                                                   \
    unregister_trace_##name(void (*probe)(data_proto), void *data)      \
    {                                                                   \
        return tracepoint_probe_unregister(&__tracepoint_##name,        \
                        (void *)probe, data);                           \
    }                                                                   \
    static inline void                                                  \
    check_trace_callback_type_##name(void (*cb)(data_proto))            \
    {                                                                   \
    }                                                                   \
    static inline bool                                                  \
    trace_##name##_enabled(void)                                        \
    {                                                                   \
        return static_key_false(&__tracepoint_##name.key);              \
    }
// __DO_TRACE() 的宏定义:
/*
 * Body of an enabled tracepoint. It contains bare `return` statements, so
 * it is meant to be expanded inside a function returning void.
 * Returns immediately when cond is false, or (with a one-time warning)
 * when RCUIDLE_COND(rcuidle) holds. Otherwise it disables preemption,
 * takes the tracepoint_srcu read lock and calls ct_irq_enter_irqson() for
 * rcuidle callers, invokes the probes through __DO_TRACE_CALL, and then
 * undoes those steps in reverse order.
 */
#define __DO_TRACE(name, args, cond, rcuidle)                           \
    do {                                                                \
        int __maybe_unused __idx = 0;                                   \
                                                                        \
        if (!(cond))                                                    \
            return;                                                     \
                                                                        \
        if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle)))                        \
            return;                                                     \
                                                                        \
        /* keep srcu and sched-rcu usage consistent */                  \
        preempt_disable_notrace();                                      \
                                                                        \
        /*                                                              \
         * For rcuidle callers, use srcu since sched-rcu                \
         * doesn't work from the idle path.                             \
         */                                                             \
        if (rcuidle) {                                                  \
            __idx = srcu_read_lock_notrace(&tracepoint_srcu);           \
            ct_irq_enter_irqson();                                      \
        }                                                               \
                                                                        \
        __DO_TRACE_CALL(name, TP_ARGS(args));                           \
                                                                        \
        if (rcuidle) {                                                  \
            ct_irq_exit_irqson();                                       \
            srcu_read_unlock_notrace(&tracepoint_srcu, __idx);          \
        }                                                               \
                                                                        \
        preempt_enable_notrace();                                       \
    } while (0)
// __DO_TRACE_CALL() 的宏定义:
/*
 * Reads the funcs pointer of __tracepoint_<name> with rcu_dereference_raw().
 * If it is non-NULL, takes the data pointer of the entry it points to and
 * invokes the static call tp_func_<name> with (__data, args).
 */
#define __DO_TRACE_CALL(name, args)                                     \
    do {                                                                \
        struct tracepoint_func *it_func_ptr;                            \
        void *__data;                                                   \
        it_func_ptr =                                                   \
            rcu_dereference_raw((&__tracepoint_##name)->funcs);         \
        if (it_func_ptr) {                                              \
            __data = (it_func_ptr)->data;                               \
            static_call(tp_func_##name)(__data, args);                  \
        }                                                               \
    } while (0)



/*
 * Another DECLARE_EVENT_CLASS definition, this one producing the class
 * descriptor: it forward-declares perf_trace_<call>() through
 * _TRACE_PERF_PROTO, defines print_fmt_<call>[] from the print argument,
 * and defines event_class_<call>, whose .probe is
 * trace_event_raw_event_<call>, whose .reg is trace_event_reg, and whose
 * remaining initializer(s) come from _TRACE_PERF_INIT.
 */
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
_TRACE_PERF_PROTO(call, PARAMS(proto));                                 \
static char print_fmt_##call[] = print;                                 \
static struct trace_event_class __used __refdata event_class_##call = { \
    .system         = TRACE_SYSTEM_STRING,                              \
    .fields_array   = trace_event_fields_##call,                        \
    .fields         = LIST_HEAD_INIT(event_class_##call.fields),        \
    .raw_init       = trace_event_raw_init,                             \
    .probe          = trace_event_raw_event_##call,                     \
    .reg            = trace_event_reg,                                  \
    _TRACE_PERF_INIT(call)                                              \
};

/* Forward declaration of perf_trace_<call>(void *__data, proto). */
#define _TRACE_PERF_PROTO(call, proto)                                  \
    static notrace void                                                 \
    perf_trace_##call(void *__data, proto);

/* Designated initializer that sets .perf_probe to perf_trace_<call>. */
#define _TRACE_PERF_INIT(call)                                          \
    .perf_probe     = perf_trace_##call,

/* 烂尾了 */

至此,tracepoint 的定义基本明了。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
// ${KERNEL}/net/netlink/af_netlink.c

/* Out-of-line wrapper that forwards msg to trace_netlink_extack(). */
void do_trace_netlink_extack(const char *msg)
{
    trace_netlink_extack(msg);
}

trace_netlink_extack()
|-->static_call(tp_func_netlink_extack)(__data, args); // DECLARE_TRACE() -> __DECLARE_TRACE() -> __DO_TRACE() -> __DO_TRACE_CALL()
     *
      * (未知调用链)
       *
        perf_trace_netlink_extack()
        |-->perf_trace_run_bpf_submit()         // ${KERNEL}/kernel/events/core.c
            |-->trace_call_bpf()                // ${KERNEL}/kernel/trace/bpf_trace.c
                |-->bpf_prog_run_array(rcu_dereference(call->prog_array), ctx, bpf_prog_run);   // ${KERNEL}/include/linux/bpf.h

bpf attach 到 tracepoint

从 Go 代码出发,如何将 bpf attach 到 tracepoint 上呢?

1
2
3
4
5
Tracepoint()                                    // ${cilium/ebpf}/link/tracepoint.go
|-->attachPerfEvent()                           // ${cilium/ebpf}/link/perf_event.go
    |-->attachPerfEventLink()
        |-->LinkCreatePerfEvent()               // ${cilium/ebpf}/internal/types.go
            |--> BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr))

接着看内核对应的源代码:

1
2
3
4
5
6
7
8
9
__sys_bpf()                                     // ${KERNEL}/kernel/bpf/syscall.c
|-->link_create()
    |-->bpf_perf_link_attach()
        |-->perf_event_set_bpf_prog()           // ${KERNEL}/kernel/events/core.c
            |-->perf_event_attach_bpf_prog() {  // ${KERNEL}/kernel/trace/bpf_trace.c
                    old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
                    bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
                    rcu_assign_pointer(event->tp_event->prog_array, new_array);
                }

好吧,即使看到这里,只是大概弄明白了 tracepoint bpf 程序是怎么跑起来的,但还是不知道 tracepoint 的工作原理。

小结

本文尝试从源代码的角度来分析 tracepoint 的工作原理;不过尝试并未成功。