Linux内核在v5.2支持了eBPF全局变量(见v5.2)。

全局变量会被Clang编译器放入.bss, .data, .rodata sections。因此,支持全局变量的关键在于:如何把这些sections中的数据加载进内核,并且在relocation时,为访问这些变量的指令填入正确的地址。

早期 workaround

关于 eBPF 静态变量支持的讨论在Linux Plumbers Conference 2018 由Cilium提出。早期的 workaround 如下:

#include <linux/bpf.h>

/*
 * Fixed-width integer aliases used by the macros below.
 * (The 64-bit alias must be named __u64: __fetch() casts through it.)
 */
typedef unsigned int __u32;
typedef long long unsigned int __u64;

/*
 * __section(NAME): place the annotated symbol in ELF section NAME and mark
 * it "used" so the compiler keeps it even when nothing references it.
 */
#ifndef __section
# define __section(NAME)				\
	__attribute__((section(NAME), used))
#endif
/*
 * __fetch(x): take the address of x and narrow it to 32 bits.
 * Taking the address makes the compiler emit a relocation against x; the
 * workaround described in this article relies on the loader patching that
 * instruction's immediate afterwards. The expansion is fully parenthesized
 * so it composes safely inside larger expressions.
 */
#ifndef __fetch
# define __fetch(x) ((__u32)(__u64)(&(x)))
#endif

/*
 * Global variable under test. As written (with an initializer) it is placed
 * in .data; the two commented-out alternatives show the declarations that
 * would land in .bss and .rodata instead.
 */
__u32 foo = 42; 			// .data section
// __u32 foo;   			// .bss section
// const __u32 foo = 42; 	// .rodata section

/*
 * Program entry point.
 *
 * Writes __fetch(foo) into skb->mark and returns 0. The reference to foo is
 * what produces the relocation entry that the loader has to resolve.
 */
int __main(struct __sk_buff *skb)
{
    skb->mark = __fetch(foo);
    return 0;
}

/* License string emitted into the "license" ELF section; empty in this example. */
char __license[] __section("license") = "";

编译后,foo变量存储在.data section内。

# llvm-readelf -S test.o
There are 8 section headers, starting at offset 0x148:

Section Headers:
  [Nr] Name              Type            Address          Off    Size   ES Flg Lk Inf Al
  [ 0]                   NULL            0000000000000000 000000 000000 00      0   0  0
  [ 1] .strtab           STRTAB          0000000000000000 0000fa 00004b 00      0   0  1
  [ 2] .text             PROGBITS        0000000000000000 000040 000028 00  AX  0   0  8
  [ 3] .rel.text         REL             0000000000000000 0000e8 000010 10      7   2  8
  [ 4] .data             PROGBITS        0000000000000000 000068 000004 00  WA  0   0  4
  [ 5] license           PROGBITS        0000000000000000 00006c 000001 00  WA  0   0  1
  [ 6] .llvm_addrsig     LLVM_ADDRSIG    0000000000000000 0000f8 000002 00   E  7   0  1
  [ 7] .symtab           SYMTAB          0000000000000000 000070 000078 18      1   2  8
Key to Flags:
  W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
  L (link order), O (extra OS processing required), G (group), T (TLS),
  C (compressed), x (unknown), o (OS specific), E (exclude),
  p (processor specific)

.rel.text section中也存在foo变量的relocation信息:

# llvm-readelf -r test.o

Relocation section '.rel.text' at offset 0xe8 contains 1 entries:
    Offset             Info             Type               Symbol's Value  Symbol's Name
0000000000000000  0000000400000001 R_BPF_64_64            0000000000000000 foo

# llvm-readelf -s test.o

Symbol table '.symtab' contains 5 entries:
   Num:    Value          Size Type    Bind   Vis       Ndx Name
     0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT   UND
     1: 0000000000000000     0 FILE    LOCAL  DEFAULT   ABS test.c
     2: 0000000000000000     1 OBJECT  GLOBAL DEFAULT     5 __license
     3: 0000000000000000    40 FUNC    GLOBAL DEFAULT     2 __main
     4: 0000000000000000     4 OBJECT  GLOBAL DEFAULT     4 foo

接下来的工作就是让 loader 识别.data, .rodata, .bss sections 中的数据,并完成relocation。Cilium修改了 iproute2 的实现,添加了对.data 和 .bss sections 中数据的 relocation 支持:

// https://github.com/isovalent/iproute2/blob/static-data/lib/bpf.c#L2543
/*
 * Resolve one relocation by inlining the variable's value into the
 * instruction's immediate.
 *
 * The relocation's byte offset (relo->r_offset) is converted into an
 * instruction index; an index past the end of the program is rejected with
 * -EINVAL. Otherwise an int is read from ctx->glo_data->d_buf at byte offset
 * sym->st_value and stored into that instruction's imm field.
 *
 * NOTE(review): glo_data is presumably the ELF data of the .data section --
 * confirm against the rest of lib/bpf.c. sym->st_value is not range-checked
 * here, and `props` is not used.
 *
 * Returns 0 on success, -EINVAL if the relocation points outside the program.
 */
static int bpf_apply_relo_glob(struct bpf_elf_ctx *ctx, struct bpf_elf_prog *prog,
			       GElf_Rel *relo, GElf_Sym *sym,
			       struct bpf_relo_props *props)
{
	/* r_offset is in bytes; instructions are indexed per struct bpf_insn. */
	unsigned int insn_off = relo->r_offset / sizeof(struct bpf_insn);
	int *data;

	if (insn_off >= prog->insns_num)
		return -EINVAL;

	/* Locate the variable inside the section buffer and copy its value. */
	data = ctx->glo_data->d_buf + sym->st_value;
	prog->insns[insn_off].imm = *data;
	return 0;
}

/*
 * Resolve one relocation by writing 0 into the instruction's immediate.
 *
 * The relocation's byte offset (relo->r_offset) is converted into an
 * instruction index; an index past the end of the program is rejected with
 * -EINVAL. No section data is read: the immediate is simply set to 0.
 * `ctx`, `sym` and `props` are not used.
 *
 * Returns 0 on success, -EINVAL if the relocation points outside the program.
 */
static int bpf_apply_relo_bss(struct bpf_elf_ctx *ctx, struct bpf_elf_prog *prog,
			      GElf_Rel *relo, GElf_Sym *sym,
			      struct bpf_relo_props *props)
{
	/* r_offset is in bytes; instructions are indexed per struct bpf_insn. */
	unsigned int insn_off = relo->r_offset / sizeof(struct bpf_insn);

	if (insn_off >= prog->insns_num)
		return -EINVAL;

	prog->insns[insn_off].imm = 0;
	return 0;
}

可以看到,relocation做的操作就是把实际的数据存到imm字段中。imm字段长度为32位,这也是这种方式的限制:只支持能放入imm字段中的简单数据类型,对复杂的结构体和数组不太适用。 优点是能适用于老内核,只需要修改用户态bpf loader的实现。

libbpf相应的修改patch:https://patchwork.ozlabs.org/project/netdev/cover/20190212004729.535-1-joe@wand.net.nz/

当前的支持方式

在会上的讨论中,一位参与者提出了利用bpf map来实现eBPF全局变量。这也是现在eBPF全局变量的实现方式。

patch:[PATCH bpf-next v2 0/7] BPF support for global data

主要的修改也集中在BPF loader,基本思路如下:

  1. bpf_object__elf_collect()收集.data, .rodata, .bss sections的信息
  2. 如果存在上述sections,调用bpf_object__init_global_maps()为每个section创建一个对应的array map。每个array map仅有一个entry,map的value size为对应section的大小。如果是.rodata section,设置map为read-only。创建完成后,对于.data 和 .rodata section,将section内容通过bpf_map_update_elem()拷贝到map中。因为map本身是0值初始化的,所以.bss不需要拷贝操作。
  3. bpf_program__collect_reloc()中,记录全局变量对应的map,insn index和relocation type
  4. 最后在bpf_program__relocate()进行实际的relocation操作时,标记 ldimm64指令(宽指令):
    1. src_reg = BPF_PSEUDO_MAP_VALUE,值为0x2
    2. 第一个insn中imm字段存储map的file descriptor
    3. 第二个insn中imm字段存储变量在section中的offset
    4. 指令伪代码为 dst = map_val(map_by_fd(imm)) + next_imm

在内核侧,对于标记了BPF_PSEUDO_MAP_VALUE的load指令,会存储实际的目标地址,以进行一个 "map-lookup-free" 的访问。也就是用实际的map value的基址+offset访问,无需map查找。

patch commit message中给出的例子:

# readelf -a test_global_data.o
  [...]
  [ 6] .bss              NOBITS           0000000000000000  00000328
       0000000000000010  0000000000000000  WA       0     0     8
  [ 7] .data             PROGBITS         0000000000000000  00000328
       0000000000000010  0000000000000000  WA       0     0     8
  [ 8] .rodata           PROGBITS         0000000000000000  00000338
       0000000000000018  0000000000000000   A       0     0     8
  [...]
    95: 0000000000000000     8 OBJECT  LOCAL  DEFAULT    6 static_bss
    96: 0000000000000008     8 OBJECT  LOCAL  DEFAULT    6 static_bss2
    97: 0000000000000000     8 OBJECT  LOCAL  DEFAULT    7 static_data
    98: 0000000000000008     8 OBJECT  LOCAL  DEFAULT    7 static_data2
    99: 0000000000000000     8 OBJECT  LOCAL  DEFAULT    8 static_rodata
   100: 0000000000000008     8 OBJECT  LOCAL  DEFAULT    8 static_rodata2
   101: 0000000000000010     8 OBJECT  LOCAL  DEFAULT    8 static_rodata3
  [...]

  # bpftool prog
  103: sched_cls  name load_static_dat  tag 37a8b6822fc39a29  gpl
       loaded_at 2019-02-28T02:02:35+0000  uid 0
       xlated 712B  jited 426B  memlock 4096B  map_ids 63,64,65,66
  # bpftool map show id 63
  63: array  name .bss  flags 0x0                      <-- .bss area, rw
      key 4B  value 16B  max_entries 1  memlock 4096B
  # bpftool map show id 64
  64: array  name .data  flags 0x0                     <-- .data area, rw
      key 4B  value 16B  max_entries 1  memlock 4096B
  # bpftool map show id 65
  65: array  name .rodata  flags 0x80                  <-- .rodata area, ro
      key 4B  value 24B  max_entries 1  memlock 4096B

  # bpftool prog dump xlated id 103
  int load_static_data(struct __sk_buff * skb):
  ; int load_static_data(struct __sk_buff *skb)
     0: (b7) r1 = 0
  ; key = 0;
     1: (63) *(u32 *)(r10 -4) = r1
     2: (bf) r6 = r10
  ; int load_static_data(struct __sk_buff *skb)
     3: (07) r6 += -4
  ; bpf_map_update_elem(&result, &key, &static_bss, 0);
     4: (18) r1 = map[id:66]
     6: (bf) r2 = r6
     7: (18) r3 = map[id:63][0]+0         <-- direct static_bss addr in .bss area
     9: (b7) r4 = 0
    10: (85) call array_map_update_elem#99888
    11: (b7) r1 = 1
  ; key = 1;
    12: (63) *(u32 *)(r10 -4) = r1
  ; bpf_map_update_elem(&result, &key, &static_data, 0);
    13: (18) r1 = map[id:66]
    15: (bf) r2 = r6
    16: (18) r3 = map[id:64][0]+0         <-- direct static_data addr in .data area
    18: (b7) r4 = 0
    19: (85) call array_map_update_elem#99888
    20: (b7) r1 = 2
  ; key = 2;
    21: (63) *(u32 *)(r10 -4) = r1
  ; bpf_map_update_elem(&result, &key, &static_rodata, 0);
    22: (18) r1 = map[id:66]
    24: (bf) r2 = r6
    25: (18) r3 = map[id:65][0]+0         <-- direct static_rodata addr in .rodata area
    27: (b7) r4 = 0
    28: (85) call array_map_update_elem#99888
    29: (b7) r1 = 3
  ; key = 3;
    30: (63) *(u32 *)(r10 -4) = r1
  ; bpf_map_update_elem(&result, &key, &static_bss2, 0);
    31: (18) r7 = map[id:63][0]+8         <--.
    33: (18) r1 = map[id:66]                 |
    35: (bf) r2 = r6                         |
    36: (18) r3 = map[id:63][0]+8         <-- direct static_bss2 addr in .bss area
    38: (b7) r4 = 0
    39: (85) call array_map_update_elem#99888
  [...]

会上提到的另外一种实现思路:

  1. 扩展程序加载流程,BPF loader将全局数据拷贝进内核
  2. prog-local buffer放在prog结构体中prog->aux->global.{data,size}
  3. 程序load时改写指令中访问全局变量的地址 -> verifier重写特殊的LD_IMM_DW指令
  4. src_reg = BPF_PSEUDO_PROG_BUFF, imm = sym->st_value
  5. Generalizing PTR_TO_MAP_VALUE for generic reuse of size limit
  6. Buffer could be RO or RW (e.g. in combination with "BPF spinlocks")