[RFC][BPF] Support Jump Table

Yonghong Song · Yonghong Song · commit 7301fc8ba28a · 2025-04-04T07:45:11.000-07:00
NOTE: We probably need cpu v5 or other flags to enable this feature. We can add it later when necessary. This patch adds jump table support. A new insn 'gotox <reg>' is added to allow goto through a register. The register represents the address in the current section. The following is a concrete example with bpf selftest progs/user_ringbuf_success.c. Compilation command line to generate .s file: ============================================= clang -g -Wall -Werror -D__TARGET_ARCH_x86 -mlittle-endian \ -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/include \ -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf \ -I/home/yhs/work/bpf-next/tools/include/uapi \ -I/home/yhs/work/bpf-next/tools/testing/selftests/usr/include -std=gnu11 \ -fno-strict-aliasing -Wno-compare-distinct-pointer-types \ -idirafter /home/yhs/work/llvm-project/llvm/build.21/Release/lib/clang/21/include \ -idirafter /usr/local/include -idirafter /usr/include \ -DENABLE_ATOMICS_TESTS -O2 -S progs/user_ringbuf_success.c \ -o /home/yhs/work/bpf-next/tools/testing/selftests/bpf/user_ringbuf_success.bpf.o.s \ --target=bpf -mcpu=v3 The related assembly: read_protocol_msg: ... r3 <<= 3 r1 = .LJTI1_0 ll r1 += r3 r1 = *(u64 *)(r1 + 0) gotox r1 LBB1_4: r1 = *(u64 *)(r0 + 8) goto LBB1_5 LBB1_7: r1 = *(u64 *)(r0 + 8) goto LBB1_8 LBB1_9: w1 = *(u32 *)(r0 + 8) r1 <<= 32 r1 s>>= 32 r2 = kern_mutated ll r3 = *(u64 *)(r2 + 0) r3 *= r1 *(u64 *)(r2 + 0) = r3 goto LBB1_11 LBB1_6: w1 = *(u32 *)(r0 + 8) r1 <<= 32 r1 s>>= 32 LBB1_5: ... .section .rodata,"a",@progbits .p2align 3, 0x0 .LJTI1_0: .quad LBB1_4 .quad LBB1_6 .quad LBB1_7 .quad LBB1_9 ... publish_next_kern_msg: ... r6 <<= 3 r1 = .LJTI6_0 ll r1 += r6 r1 = *(u64 *)(r1 + 0) gotox r1 LBB6_3: ... LBB6_5: ... LBB6_6: ... LBB6_4: ... 
.section .rodata,"a",@progbits .p2align 3, 0x0 .LJTI6_0: .quad LBB6_3 .quad LBB6_4 .quad LBB6_5 .quad LBB6_6 Now let us look at the .o file ========================== clang -g -Wall -Werror -D__TARGET_ARCH_x86 -mlittle-endian \ -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/include \ -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf \ -I/home/yhs/work/bpf-next/tools/include/uapi \ -I/home/yhs/work/bpf-next/tools/testing/selftests/usr/include \ -std=gnu11 -fno-strict-aliasing -Wno-compare-distinct-pointer-types \ -idirafter /home/yhs/work/llvm-project/llvm/build.21/Release/lib/clang/21/include \ -idirafter /usr/local/include -idirafter /usr/include -DENABLE_ATOMICS_TESTS \ -O2 -c progs/user_ringbuf_success.c \ -o /home/yhs/work/bpf-next/tools/testing/selftests/bpf/user_ringbuf_success.bpf.o \ --target=bpf -mcpu=v3 In the object file, all .rodata sections are merged together. So we have $ llvm-readelf -x '.rodata' user_ringbuf_success.bpf.o Hex dump of section '.rodata': 0x00000000 a8020000 00000000 10030000 00000000 ................ 0x00000010 b8020000 00000000 c8020000 00000000 ................ 0x00000020 40040000 00000000 18050000 00000000 @............... 0x00000030 88040000 00000000 d0040000 00000000 ................ 0x00000040 44726169 6e207265 7475726e 65643a20 Drain returned: 0x00000050 256c640a 00556e65 78706563 7465646c %ld..Unexpectedl 0x00000060 79206661 696c6564 20746f20 67657420 y failed to get 0x00000070 6d73670a 00556e72 65636f67 6e697a65 msg..Unrecognize 0x00000080 64206f70 2025640a 00256c75 20213d20 d op %d..%lu != 0x00000090 256c750a 00627066 5f64796e 7074725f %lu..bpf_dynptr_ 0x000000a0 72656164 28292066 61696c65 643a2025 read() failed: % 0x000000b0 640a0055 6e657870 65637465 646c7920 d..Unexpectedly 0x000000c0 6661696c 65642074 6f206765 74207361 failed to get sa 0x000000d0 6d706c65 0a00 mple.. Let us look at the insns. Some annotations explain the details. $ llvm-objdump -Sr user_ringbuf_success.bpf.o .... 
Disassembly of section .text: 0000000000000000 <read_protocol_msg>: ; msg = bpf_dynptr_data(dynptr, 0, sizeof(*msg)); 0: b4 02 00 00 00 00 00 00 w2 = 0x0 1: b4 03 00 00 10 00 00 00 w3 = 0x10 2: 85 00 00 00 cb 00 00 00 call 0xcb ... 0000000000000268 <handle_sample_msg>: ; switch (msg->msg_op) { 77: 61 13 00 00 00 00 00 00 w3 = *(u32 *)(r1 + 0x0) 78: 26 03 1c 00 03 00 00 00 if w3 > 0x3 goto +0x1c <handle_sample_msg+0xf0> 79: 67 03 00 00 03 00 00 00 r3 <<= 0x3 80: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll 0000000000000280: R_BPF_64_64 .rodata <=== r2 will be the address of .rodata with offset 0. <=== look at the first 32 bytes of .rodata: 0x00000000 a8020000 00000000 10030000 00000000 ................ 0x00000010 b8020000 00000000 c8020000 00000000 ................ The four actual addresses are 0x2a8: insn idx 0x2a8/8 = 85 0x310: insn idx 0x310/8 = 98 0x2b8: insn idx 0x2b8/8 = 87 0x2c8: insn idx 0x2c8/8 = 89 82: 0f 32 00 00 00 00 00 00 r2 += r3 83: 79 22 00 00 00 00 00 00 r2 = *(u64 *)(r2 + 0x0) 84: 0d 02 00 00 00 00 00 00 gotox r2 <=== So eventually gotox will go to the insn idx in this section. ; kern_mutated += msg->operand_64; 85: 79 11 08 00 00 00 00 00 r1 = *(u64 *)(r1 + 0x8) 86: 05 00 0e 00 00 00 00 00 goto +0xe <handle_sample_msg+0xc0> ; kern_mutated *= msg->operand_64; 87: 79 11 08 00 00 00 00 00 r1 = *(u64 *)(r1 + 0x8) 88: 05 00 03 00 00 00 00 00 goto +0x3 <handle_sample_msg+0x78> ; kern_mutated *= msg->operand_32; 89: 61 11 08 00 00 00 00 00 w1 = *(u32 *)(r1 + 0x8) 90: 67 01 00 00 20 00 00 00 r1 <<= 0x20 91: c7 01 00 00 20 00 00 00 r1 s>>= 0x20 92: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll ... 
00000000000003a0 <publish_next_kern_msg>: ; { 116: bc 16 00 00 00 00 00 00 w6 = w1 ; msg = bpf_ringbuf_reserve(&kernel_ringbuf, sizeof(*msg), 0); 117: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll 00000000000003a8: R_BPF_64_64 kernel_ringbuf 119: b7 02 00 00 10 00 00 00 r2 = 0x10 120: b7 03 00 00 00 00 00 00 r3 = 0x0 121: 85 00 00 00 83 00 00 00 call 0x83 ; if (!msg) { 122: 55 00 06 00 00 00 00 00 if r0 != 0x0 goto +0x6 <publish_next_kern_msg+0x68> ; err = 4; 123: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll 00000000000003d8: R_BPF_64_64 err 125: b4 02 00 00 04 00 00 00 w2 = 0x4 126: 63 21 00 00 00 00 00 00 *(u32 *)(r1 + 0x0) = w2 127: b4 00 00 00 01 00 00 00 w0 = 0x1 ; return 1; 128: 05 00 31 00 00 00 00 00 goto +0x31 <publish_next_kern_msg+0x1f0> ; switch (index % TEST_MSG_OP_NUM_OPS) { 129: 54 06 00 00 03 00 00 00 w6 &= 0x3 130: 67 06 00 00 03 00 00 00 r6 <<= 0x3 131: 18 01 00 00 20 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x20 ll 0000000000000418: R_BPF_64_64 .rodata <=== r1 will be the address of .rodata with offset 0x20. <=== look at the 32 bytes of .rodata starting at offset 0x20: 0x00000020 40040000 00000000 18050000 00000000 @............... 0x00000030 88040000 00000000 d0040000 00000000 ................ The four actual addresses are 0x440: insn idx 0x440/8 = 136 0x518: insn idx 0x518/8 = 163 0x488: insn idx 0x488/8 = 145 0x4d0: insn idx 0x4d0/8 = 154 133: 0f 61 00 00 00 00 00 00 r1 += r6 134: 79 11 00 00 00 00 00 00 r1 = *(u64 *)(r1 + 0x0) 135: 0d 01 00 00 00 00 00 00 gotox r1 <=== So eventually gotox will go to the insn idx in this section. 
136: b4 01 00 00 00 00 00 00 w1 = 0x0 ; msg->msg_op = TEST_MSG_OP_INC64; 137: 63 10 00 00 00 00 00 00 *(u32 *)(r0 + 0x0) = w1 138: b7 01 00 00 04 00 00 00 r1 = 0x4 ; msg->operand_64 = operand_64; 139: 7b 10 08 00 00 00 00 00 *(u64 *)(r0 + 0x8) = r1 ; expected_user_mutated += operand_64; 140: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll 0000000000000460: R_BPF_64_64 expected_user_mutated 142: 79 11 00 00 00 00 00 00 r1 = *(u64 *)(r1 + 0x0) 143: 07 01 00 00 04 00 00 00 r1 += 0x4 ; break; 144: 05 00 1a 00 00 00 00 00 goto +0x1a <publish_next_kern_msg+0x1b8> 145: b4 01 00 00 02 00 00 00 w1 = 0x2 ; msg->msg_op = TEST_MSG_OP_MUL64; ... There are a few things worth discussing. First, in the above, it is hard to find the jump table size for a particular relocation ('R_BPF_64_64 .rodata + <offset>'). One approach is to scan through the whole elf file and find all '.rodata + <offset>' relocations. For example, here we have .rodata + 0 .rodata + 0x20 .rodata + 0x40 .rodata + 0x55 .rodata + 0x75 .rodata + 0x89 .rodata + 0x95 .rodata + 0xb3 With the above information, the size of each sub-rodata can be found easily. An option -bpf-min-jump-table-entries is implemented to control the minimum number of entries to use a jump table on BPF. The default value is 4, but it can be changed with the following clang option clang ... -mllvm -bpf-min-jump-table-entries=6 where the number of jump table cases needs to be >= 6 in order to use a jump table.
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -36,6 +36,10 @@ static cl::opt<bool> BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order",
   cl::Hidden, cl::init(false),
   cl::desc("Expand memcpy into load/store pairs in order"));
 
+static cl::opt<unsigned> BPFMinimumJumpTableEntries(
+    "bpf-min-jump-table-entries", cl::init(4), cl::Hidden,
+    cl::desc("Set minimum number of entries to use a jump table on BPF"));
+
 static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg,
                  SDValue Val = {}) {
   std::string Str;
@@ -65,10 +69,11 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
-  setOperationAction(ISD::BRIND, MVT::Other, Expand);
   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
 
-  setOperationAction({ISD::GlobalAddress, ISD::ConstantPool}, MVT::i64, Custom);
+  setOperationAction({ISD::GlobalAddress, ISD::ConstantPool, ISD::JumpTable,
+                      ISD::BlockAddress},
+                     MVT::i64, Custom);
 
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
@@ -155,6 +160,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
 
   setBooleanContents(ZeroOrOneBooleanContent);
   setMaxAtomicSizeInBitsSupported(64);
+  setMinimumJumpTableEntries(BPFMinimumJumpTableEntries);
 
   // Function alignments
   setMinFunctionAlignment(Align(8));
@@ -312,10 +318,14 @@ SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     report_fatal_error("unimplemented opcode: " + Twine(Op.getOpcode()));
   case ISD::BR_CC:
     return LowerBR_CC(Op, DAG);
+  case ISD::JumpTable:
+    return LowerJumpTable(Op, DAG);
   case ISD::GlobalAddress:
     return LowerGlobalAddress(Op, DAG);
   case ISD::ConstantPool:
     return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:
+    return LowerBlockAddress(Op, DAG);
   case ISD::SELECT_CC:
     return LowerSELECT_CC(Op, DAG);
   case ISD::SDIV:
@@ -726,6 +736,11 @@ SDValue BPFTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
   return Op;
 }
 
+SDValue BPFTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
+  return getAddr(N, DAG);
+}
+
 const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((BPFISD::NodeType)Opcode) {
   case BPFISD::FIRST_NUMBER:
@@ -757,6 +772,17 @@ static SDValue getTargetNode(ConstantPoolSDNode *N, const SDLoc &DL, EVT Ty,
                                    N->getOffset(), Flags);
 }
 
+static SDValue getTargetNode(BlockAddressSDNode *N, const SDLoc &DL, EVT Ty,
+                             SelectionDAG &DAG, unsigned Flags) {
+  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, N->getOffset(),
+                                   Flags);
+}
+
+static SDValue getTargetNode(JumpTableSDNode *N, const SDLoc &DL, EVT Ty,
+                             SelectionDAG &DAG, unsigned Flags) {
+  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flags);
+}
+
 template <class NodeTy>
 SDValue BPFTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
                                    unsigned Flags) const {
@@ -783,6 +809,12 @@ SDValue BPFTargetLowering::LowerConstantPool(SDValue Op,
   return getAddr(N, DAG);
 }
 
+SDValue BPFTargetLowering::LowerBlockAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
+  return getAddr(N, DAG);
+}
+
 unsigned
 BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
                                  unsigned Reg, bool isSigned) const {
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -80,6 +80,8 @@ class BPFTargetLowering : public TargetLowering {
   SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
 
   template <class NodeTy>
   SDValue getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -183,6 +183,15 @@ class TYPE_LD_ST<bits<3> mode, bits<2> size,
   let Inst{60-59} = size;
 }
 
+// For indirect jump
+class TYPE_IND_JMP<bits<4> op, bits<1> srctype,
+                   dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstBPF<outs, ins, asmstr, pattern> {
+
+  let Inst{63-60} = op;
+  let Inst{59} = srctype;
+}
+
 // jump instructions
 class JMP_RR<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
     : TYPE_ALU_JMP<Opc.Value, BPF_X.Value,
@@ -216,6 +225,18 @@ class JMP_RI<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
   let BPFClass = BPF_JMP;
 }
 
+class JMP_IND<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
+    : TYPE_ALU_JMP<Opc.Value, BPF_X.Value,
+                   (outs),
+                   (ins GPR:$dst),
+                   !strconcat(OpcodeStr, " $dst"),
+                   Pattern> {
+  bits<4> dst;
+
+  let Inst{51-48} = dst;
+  let BPFClass = BPF_JMP;
+}
+
 class JMP_JCOND<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
     : TYPE_ALU_JMP<Opc.Value, BPF_K.Value,
                    (outs),
@@ -281,6 +302,10 @@ defm JSLT : J<BPF_JSLT, "s<", BPF_CC_LT, BPF_CC_LT_32>;
 defm JSLE : J<BPF_JSLE, "s<=", BPF_CC_LE, BPF_CC_LE_32>;
 defm JSET : J<BPF_JSET, "&", NoCond, NoCond>;
 def JCOND : JMP_JCOND<BPF_JCOND, "may_goto", []>;
+
+let isIndirectBranch = 1 in {
+  def JX : JMP_IND<BPF_JA, "gotox", [(brind i64:$dst)]>;
+}
 }
 
 // ALU instructions
@@ -851,6 +876,8 @@ let usesCustomInserter = 1, isCodeGenOnly = 1 in {
 // load 64-bit global addr into register
 def : Pat<(BPFWrapper tglobaladdr:$in), (LD_imm64 tglobaladdr:$in)>;
 def : Pat<(BPFWrapper tconstpool:$in), (LD_imm64 tconstpool:$in)>;
+def : Pat<(BPFWrapper tblockaddress:$in), (LD_imm64 tblockaddress:$in)>;
+def : Pat<(BPFWrapper tjumptable:$in), (LD_imm64 tjumptable:$in)>;
 
 // 0xffffFFFF doesn't fit into simm32, optimize common case
 def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
diff --git a/llvm/lib/Target/BPF/BPFMCInstLower.cpp b/llvm/lib/Target/BPF/BPFMCInstLower.cpp
@@ -77,6 +77,9 @@ void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
     case MachineOperand::MO_ConstantPoolIndex:
       MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
       break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
+      break;
     }
 
     OutMI.addOperand(MCOp);

Original file line number	Diff line number	Diff line change
`@@ -77,6 +77,9 @@ void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {`
`77`	`77`	`case MachineOperand::MO_ConstantPoolIndex:`
`78`	`78`	`MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));`
`79`	`79`	`break;`
	`80`	`+ case MachineOperand::MO_JumpTableIndex:`
	`81`	`+ MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));`
	`82`	`+ break;`
`80`	`83`	`}`
`81`	`84`
`82`	`85`	`OutMI.addOperand(MCOp);`