eBPF is a revolutionary technology with origins in the Linux kernel that can run sandboxed programs in a privileged context such as the operating system kernel. It is used to safely and efficiently extend the capabilities of the kernel without changing kernel source code or loading kernel modules.
Historically, the operating system has always been an ideal place to implement observability, security, and networking functionality due to the kernel's privileged ability to oversee and control the entire system. At the same time, an operating system kernel is hard to evolve due to its central role and its high requirements for stability and security. The rate of innovation at the operating system level has thus traditionally been lower than that of functionality implemented outside the operating system.
eBPF changes this formula fundamentally. By allowing sandboxed programs to run within the operating system, application developers can use eBPF programs to add capabilities to the operating system at runtime. The operating system then guarantees safety and near-native execution efficiency with the aid of a Just-In-Time (JIT) compiler and a verification engine. This has led to a wave of eBPF-based projects covering a wide array of use cases, including next-generation networking, observability, and security functionality.
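To make this concrete, here is a minimal sketch of what such a sandboxed program looks like in restricted C, using libbpf conventions (the tracepoint and names are illustrative, not taken from the Android code discussed below); the verifier checks it at load time, and the JIT then compiles it to native code:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

// Illustrative example: log every execve() entry to the trace pipe.
SEC("tracepoint/syscalls/sys_enter_execve")
int trace_execve(void *ctx)
{
    bpf_printk("execve called");  // readable via /sys/kernel/debug/tracing/trace_pipe
    return 0;
}

char _license[] SEC("license") = "GPL";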
Today, eBPF is used extensively to drive a wide variety of use cases: providing high-performance networking and load balancing in modern data centers and cloud-native environments, extracting fine-grained security observability data at low overhead, helping application developers trace applications, providing insights for performance troubleshooting, enforcing preventive application and container runtime security, and much more. The possibilities are endless, and the innovation that eBPF is unlocking has only just begun.
on load_bpf_programs
    # Linux 5.16-rc1 has changed the default to 2 (disabled but changeable),
    # but we need 0
    write /proc/sys/kernel/unprivileged_bpf_disabled 0
    # Enable the eBPF JIT -- but do note that on 64-bit kernels it is likely
    # already force enabled by the kernel config option BPF_JIT_ALWAYS_ON
    write /proc/sys/net/core/bpf_jit_enable 1
    # Enable JIT kallsyms export for privileged users only
    write /proc/sys/net/core/bpf_jit_kallsyms 1
    exec_start bpfloader

service bpfloader /system/bin/bpfloader
    capabilities CHOWN SYS_ADMIN NET_ADMIN
    #
    # Set RLIMIT_MEMLOCK to 1GiB for bpfloader
    #
    # Actually only 8MiB would be needed if bpfloader ran as its own uid.
    #
    # As such we simply use 1GiB as a reasonable approximation of infinity.
    #
    rlimit memlock 1073741824 1073741824
    oneshot
    reboot_on_failure reboot,bpfloader-failed
    updatable
    // Create all the pin subdirectories
    // (this must be done first to allow selinux_context and pin_subdir functionality,
    //  which could otherwise fail with ENOENT during object pinning or renaming,
    //  due to ordering issues)
    for (const auto& location : locations) {
        createSysFsBpfSubDir(location.prefix);
    }
    // Load all ELF objects, create programs and maps, and pin them
    for (const auto& location : locations) {
        if (loadAllElfObjects(location) != 0) {
            ALOGE("=== CRITICAL FAILURE LOADING BPF PROGRAMS FROM %s ===", location.dir);
            ALOGE("If this triggers reliably, you're probably missing kernel options or patches.");
            ALOGE("If this triggers randomly, you might be hitting some memory allocation "
                  "problems or startup script race.");
            ALOGE("--- DO NOT EXPECT SYSTEM TO BOOT SUCCESSFULLY ---");
            sleep(20);
            return 2;
        }
    }
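After a successful load, the programs and maps end up pinned in the BPF filesystem (on Android, under /sys/fs/bpf) so that long-lived daemons such as netd can open them after bpfloader itself has exited; the exact prog_*/map_* pin-name layout is an implementation detail of the loader.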
    if (android::base::SetProperty("bpf.progs_loaded", "1") == false) {
        ALOGE("Failed to set bpf.progs_loaded property");
        return 1;
    }
int loadProg(const char* elfPath, bool* isCritical, const char* prefix,
             const unsigned long long allowedDomainBitmask,
             const bpf_prog_type* allowed, size_t numAllowed) {
    vector<char> license;
    vector<char> critical;
    vector<codeSection> cs;
    vector<unique_fd> mapFds;
    int ret;

    if (!isCritical) return -1;
    *isCritical = false;

    ifstream elfFile(elfPath, ios::in | ios::binary);
    if (!elfFile.is_open()) return -1;

    // Check the license (and whether this object is marked critical)
    ret = readSectionByName("critical", elfFile, critical);
    *isCritical = !ret;

    ret = readSectionByName("license", elfFile, license);
    ...

    // Check the bpfloader version range this object requires
    // the following default values are for bpfloader V0.0 format which does not include them
    unsigned int bpfLoaderMinVer =
            readSectionUint("bpfloader_min_ver", elfFile, DEFAULT_BPFLOADER_MIN_VER);
    unsigned int bpfLoaderMaxVer =
            readSectionUint("bpfloader_max_ver", elfFile, DEFAULT_BPFLOADER_MAX_VER);
    size_t sizeOfBpfMapDef =
            readSectionUint("size_of_bpf_map_def", elfFile, DEFAULT_SIZEOF_BPF_MAP_DEF);
    size_t sizeOfBpfProgDef =
            readSectionUint("size_of_bpf_prog_def", elfFile, DEFAULT_SIZEOF_BPF_PROG_DEF);

    // inclusive lower bound check
    if (BPFLOADER_VERSION < bpfLoaderMinVer) {
        ALOGI("BpfLoader version 0x%05x ignoring ELF object %s with min ver 0x%05x",
              BPFLOADER_VERSION, elfPath, bpfLoaderMinVer);
        return 0;
    }

    // exclusive upper bound check
    if (BPFLOADER_VERSION >= bpfLoaderMaxVer) {
        ALOGI("BpfLoader version 0x%05x ignoring ELF object %s with max ver 0x%05x",
              BPFLOADER_VERSION, elfPath, bpfLoaderMaxVer);
        return 0;
    }

    ALOGI("BpfLoader version 0x%05x processing ELF object %s with ver [0x%05x,0x%05x)",
          BPFLOADER_VERSION, elfPath, bpfLoaderMinVer, bpfLoaderMaxVer);

    if (sizeOfBpfMapDef < DEFAULT_SIZEOF_BPF_MAP_DEF) {
        ALOGE("sizeof(bpf_map_def) of %zu is too small (< %d)", sizeOfBpfMapDef,
              DEFAULT_SIZEOF_BPF_MAP_DEF);
        return -1;
    }

    // Check the size of the embedded bpf_prog_def structures
    if (sizeOfBpfProgDef < DEFAULT_SIZEOF_BPF_PROG_DEF) {
        ALOGE("sizeof(bpf_prog_def) of %zu is too small (< %d)", sizeOfBpfProgDef,
              DEFAULT_SIZEOF_BPF_PROG_DEF);
        return -1;
    }

    ret = readCodeSections(elfFile, cs, sizeOfBpfProgDef, allowed, numAllowed);
    if (ret) {
        ALOGE("Couldn't read all code sections in %s", elfPath);
        return ret;
    }

    /* Just for future debugging */
    if (0) dumpAllCs(cs);

    ret = createMaps(elfPath, elfFile, mapFds, prefix, allowedDomainBitmask, sizeOfBpfMapDef);
    if (ret) {
        ALOGE("Failed to create maps: (ret=%d) in %s", ret, elfPath);
        return ret;
    }

    for (int i = 0; i < (int)mapFds.size(); i++)
        ALOGD("map_fd found at %d is %d in %s", i, mapFds[i].get(), elfPath);

    applyMapRelo(elfFile, mapFds, cs);

    ret = loadCodeSections(elfPath, cs, string(license.data()), prefix, allowedDomainBitmask);
    if (ret) ALOGE("Failed to load programs, loadCodeSections ret=%d", ret);

    return ret;
}
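readSectionUint() is not shown in the excerpt; as a hedged sketch of its assumed contract (read a 4-byte value from the named ELF section, falling back to a default for old v0.0 objects that predate these sections), it could look like this, which is not the verbatim AOSP implementation:

#include <cstring>  // memcpy

static unsigned int readSectionUint(const char* name, std::ifstream& elfFile,
                                    unsigned int defVal) {
    std::vector<char> theBytes;
    if (readSectionByName(name, elfFile, theBytes)) return defVal;  // section absent
    if (theBytes.size() < sizeof(unsigned int)) return defVal;      // section truncated
    unsigned int value;
    std::memcpy(&value, theBytes.data(), sizeof(value));  // assumes host byte order
    return value;
}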
/* Read a section by its index - for ex to get sec hdr strtab blob */
static int readCodeSections(ifstream& elfFile, vector<codeSection>& cs, size_t sizeOfBpfProgDef,
                            const bpf_prog_type* allowed, size_t numAllowed) {
    vector<Elf64_Shdr> shTable;
    int entries, ret = 0;

    ret = readSectionHeadersAll(elfFile, shTable);
    if (ret) return ret;
    entries = shTable.size();

    vector<struct bpf_prog_def> pd;
    ret = readProgDefs(elfFile, pd, sizeOfBpfProgDef);
    if (ret) return ret;
    vector<string> progDefNames;
    ret = getSectionSymNames(elfFile, "progs", progDefNames);
    if (!pd.empty() && ret) return ret;

    for (int i = 0; i < entries; i++) {
        string name;
        codeSection cs_temp;
        cs_temp.type = BPF_PROG_TYPE_UNSPEC;

        ret = getSymName(elfFile, shTable[i].sh_name, name);
        if (ret) return ret;

        enum bpf_prog_type ptype = getSectionType(name);

        if (ptype == BPF_PROG_TYPE_UNSPEC) continue;

        if (!IsAllowed(ptype, allowed, numAllowed)) {
            ALOGE("Program type %s not permitted here", getSectionName(ptype).c_str());
            return -1;
        }

        // This must be done before '/' is replaced with '_'.
        cs_temp.expected_attach_type = getExpectedAttachType(name);

        string oldName = name;
        // convert all slashes to underscores
        std::replace(name.begin(), name.end(), '/', '_');

        cs_temp.type = ptype;
        cs_temp.name = name;

        ret = readSectionByIdx(elfFile, i, cs_temp.data);
        if (ret) return ret;
        ALOGD("Loaded code section %d (%s)", i, name.c_str());

        vector<string> csSymNames;
        ret = getSectionSymNames(elfFile, oldName, csSymNames, STT_FUNC);
        if (ret || !csSymNames.size()) return ret;
        for (size_t i = 0; i < progDefNames.size(); ++i) {
            if (!progDefNames[i].compare(csSymNames[0] + "_def")) {
                cs_temp.prog_def = pd[i];
                break;
            }
        }
        /* Check for rel section */
        // guard with i + 1 so shTable[i + 1] below cannot index past the last header
        if (cs_temp.data.size() > 0 && i + 1 < entries) {
            ret = getSymName(elfFile, shTable[i + 1].sh_name, name);
            if (ret) return ret;
if (name == (".rel" + oldName)) { ret = readSectionByIdx(elfFile, i + 1, cs_temp.rel_data); if (ret) return ret; ALOGD("Loaded relo section %d (%s)", i, name.c_str()); } }
if (cs_temp.data.size() > 0) { cs.push_back(std::move(cs_temp)); ALOGD("Adding section %d to cs list", i); } } return0; }
/*
 * Map section name prefixes to program types, the section name will be:
 *   SECTION(<prefix>/<name-of-program>)
 * For example:
 *   SECTION("tracepoint/sched_switch_func") where sched_switch_func
 *   is the name of the program, and tracepoint is the type.
 *
 * However, be aware that you should not be directly using the SECTION() macro.
 * Instead use the DEFINE_(BPF|XDP)_(PROG|MAP)... & LICENSE/CRITICAL macros.
 */
sectionType sectionNameTypes[] = {
        {"bind4/",      BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND},
        {"bind6/",      BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND},
        {"cgroupskb/",  BPF_PROG_TYPE_CGROUP_SKB,       BPF_ATTACH_TYPE_UNSPEC},
        {"cgroupsock/", BPF_PROG_TYPE_CGROUP_SOCK,      BPF_ATTACH_TYPE_UNSPEC},
        {"kprobe/",     BPF_PROG_TYPE_KPROBE,           BPF_ATTACH_TYPE_UNSPEC},
        {"perf_event/", BPF_PROG_TYPE_PERF_EVENT,       BPF_ATTACH_TYPE_UNSPEC},
        {"schedact/",   BPF_PROG_TYPE_SCHED_ACT,        BPF_ATTACH_TYPE_UNSPEC},
        {"schedcls/",   BPF_PROG_TYPE_SCHED_CLS,        BPF_ATTACH_TYPE_UNSPEC},
        {"skfilter/",   BPF_PROG_TYPE_SOCKET_FILTER,    BPF_ATTACH_TYPE_UNSPEC},
        {"tracepoint/", BPF_PROG_TYPE_TRACEPOINT,       BPF_ATTACH_TYPE_UNSPEC},
        {"xdp/",        BPF_PROG_TYPE_XDP,              BPF_ATTACH_TYPE_UNSPEC},
};
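getSectionType(), used by readCodeSections() above, is not quoted; a minimal sketch, assuming it simply prefix-matches the section name against this table, would be:

// Sketch (an assumption, not the verbatim AOSP code): resolve a section name
// such as "tracepoint/sched_switch_func" to its program type by prefix match.
static enum bpf_prog_type getSectionType(std::string& name) {
    for (auto& st : sectionNameTypes)
        if (!name.compare(0, st.name.length(), st.name)) return st.type;
    return BPF_PROG_TYPE_UNSPEC;
}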
ret = readSectionByName("maps", elfFile, mdData); if (ret == -2) return0; // no maps to read if (ret) return ret; ... ret = getSectionSymNames(elfFile, "maps", mapNames); if (ret) return ret;
    for (int i = 0; i < (int)cs.size(); i++) {
        string name = cs[i].name;
        unsigned bpfMinVer = DEFAULT_BPFLOADER_MIN_VER;  // v0.0
        unsigned bpfMaxVer = DEFAULT_BPFLOADER_MAX_VER;  // v1.0
        domain selinux_context = domain::unspecified;
        domain pin_subdir = domain::unspecified;

        if (cs[i].prog_def.has_value()) {
            unsigned min_kver = cs[i].prog_def->min_kver;
            unsigned max_kver = cs[i].prog_def->max_kver;
            ALOGD("cs[%d].name:%s min_kver:%x .max_kver:%x (kvers:%x)", i, name.c_str(),
                  min_kver, max_kver, kvers);
            if (kvers < min_kver) continue;
            if (kvers >= max_kver) continue;

            bpfMinVer = cs[i].prog_def->bpfloader_min_ver;
            bpfMaxVer = cs[i].prog_def->bpfloader_max_ver;
            selinux_context = getDomainFromSelinuxContext(cs[i].prog_def->selinux_context);
            pin_subdir = getDomainFromPinSubdir(cs[i].prog_def->pin_subdir);
            // Note: make sure to only check for unrecognized *after* verifying bpfloader
            // version limits include this bpfloader's version.
        }

        ALOGD("cs[%d].name:%s requires bpfloader version [0x%05x,0x%05x)", i, name.c_str(),
              bpfMinVer, bpfMaxVer);
        if (BPFLOADER_VERSION < bpfMinVer) continue;
        if (BPFLOADER_VERSION >= bpfMaxVer) continue;
        if (unrecognized(pin_subdir)) return -ENOTDIR;

        if (specified(selinux_context)) {
            if (!inDomainBitmask(selinux_context, allowedDomainBitmask)) {
                ALOGE("prog %s has invalid selinux_context of %d (allowed bitmask 0x%llx)",
                      name.c_str(), selinux_context, allowedDomainBitmask);
                return -EINVAL;
            }
            ALOGI("prog %s selinux_context [%32s] -> %d -> '%s' (%s)", name.c_str(),
                  cs[i].prog_def->selinux_context, selinux_context,
                  lookupSelinuxContext(selinux_context), lookupPinSubdir(selinux_context));
        }

        if (specified(pin_subdir)) {
            if (!inDomainBitmask(pin_subdir, allowedDomainBitmask)) {
                ALOGE("prog %s has invalid pin_subdir of %d (allowed bitmask 0x%llx)",
                      name.c_str(), pin_subdir, allowedDomainBitmask);
                return -EINVAL;
            }
            ALOGI("prog %s pin_subdir [%32s] -> %d -> '%s'", name.c_str(),
                  cs[i].prog_def->pin_subdir, pin_subdir, lookupPinSubdir(pin_subdir));
        }

        // strip any potential $foo suffix
        // this can be used to provide duplicate programs
        // conditionally loaded based on running kernel version
        name = name.substr(0, name.find_last_of('$'));
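In other words, one object can ship several variants of the same program gated on kernel version, e.g. (hypothetical names) schedcls/ingress$4_14 and schedcls/ingress$4_19; after the min_kver/max_kver checks above, only the surviving variant is loaded, and it is pinned under the suffix-free name schedcls_ingress. Note that find_last_of() returns npos when there is no '$', in which case substr(0, npos) keeps the whole name unchanged.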
    /**
     * Set firewall rule for uid
     *
     * @param childChain target chain
     * @param uid uid to allow/deny
     * @param firewallRule either FIREWALL_RULE_ALLOW or FIREWALL_RULE_DENY
     * @throws ServiceSpecificException in case of failure, with an error code indicating the
     *                                  cause of the failure.
     */
    public void setUidRule(final int childChain, final int uid, final int firewallRule) {
        throwIfPreT("setUidRule is not available on pre-T devices");
int TrafficController::toggleUidOwnerMap(ChildChain chain, bool enable) {
    std::lock_guard guard(mMutex);
    uint32_t key = UID_RULES_CONFIGURATION_KEY;
    auto oldConfigure = mConfigurationMap.readValue(key);
    if (!oldConfigure.ok()) {
        ALOGE("Cannot read the old configuration from map: %s",
              oldConfigure.error().message().c_str());
        return -oldConfigure.error().code();
    }
    uint32_t match;
    switch (chain) {
        case DOZABLE:
            match = DOZABLE_MATCH;
            break;
        case STANDBY:
            match = STANDBY_MATCH;
            break;
        case POWERSAVE:
            match = POWERSAVE_MATCH;
            break;
        case RESTRICTED:
            match = RESTRICTED_MATCH;
            break;
        case LOW_POWER_STANDBY:
            match = LOW_POWER_STANDBY_MATCH;
            break;
        case OEM_DENY_1:
            match = OEM_DENY_1_MATCH;
            break;
        case OEM_DENY_2:
            match = OEM_DENY_2_MATCH;
            break;
        case OEM_DENY_3:
            match = OEM_DENY_3_MATCH;
            break;
        default:
            return -EINVAL;
    }
    BpfConfig newConfiguration =
            enable ? (oldConfigure.value() | match) : (oldConfigure.value() & ~match);
    Status res = mConfigurationMap.writeValue(key, newConfiguration, BPF_EXIST);
    if (!isOk(res)) {
        ALOGE("Failed to toggleUidOwnerMap(%d): %s", chain, res.msg().c_str());
    }
    return -res.code();
}
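The enable/disable arithmetic is plain bit masking. A worked example (values illustrative), starting from a configuration that already has STANDBY_MATCH set:

// enable  DOZABLE:  STANDBY_MATCH | DOZABLE_MATCH                   -> both bits set
// disable DOZABLE: (STANDBY_MATCH | DOZABLE_MATCH) & ~DOZABLE_MATCH -> STANDBY_MATCH only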
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    enum skb_drop_reason reason;
    struct sock *rsk;
    // Suppose the packet we need to handle belongs to a connection in the ESTABLISHED state
    if (sk->sk_state == TCP_ESTABLISHED) {
        /* Fast path */
        struct dst_entry *dst;
#ifdef CONFIG_CGROUP_BPF
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
    .....
    /* The skb will be handled in the
     * bpf_skops_established() or
     * bpf_skops_write_hdr_opt().
     */
    switch (sk->sk_state) {
    case TCP_SYN_RECV:
    case TCP_SYN_SENT:
    case TCP_LISTEN:
        return;
    }
/**
 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
 * @sk: socket to get cgroup from
 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
 *            sk with connection information (IP addresses, etc.) May not contain
 *            cgroup info if it is a req sock.
 * @atype: The type of program to be executed
 *
 * socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @atype must be suitable for sock_ops
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found
 * and it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
                                     struct bpf_sock_ops_kern *sock_ops,
                                     enum cgroup_bpf_attach_type atype)
{
    struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
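For context, a cgroup-attached sock_ops program that this hook would run might look like the following sketch (illustrative, using libbpf conventions; it merely opts in to TCP state-change callbacks and returns 1 so the hook above reports success):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

// Illustrative sock_ops program: once a connection is established, request
// state-change callbacks; returning 1 keeps __cgroup_bpf_run_filter_sock_ops
// from reporting -EPERM.
SEC("sockops")
int sockops_example(struct bpf_sock_ops *skops)
{
    switch (skops->op) {
    case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
    case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
        bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG);
        break;
    }
    return 1;
}

char _license[] SEC("license") = "GPL";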
// DROP_IF_SET is set of rules that DROP if rule is globally enabled, and per-uid bit is set
#define DROP_IF_SET (STANDBY_MATCH | OEM_DENY_1_MATCH | OEM_DENY_2_MATCH | OEM_DENY_3_MATCH)
// DROP_IF_UNSET is set of rules that should DROP if globally enabled, and per-uid bit is NOT set
#define DROP_IF_UNSET (DOZABLE_MATCH | POWERSAVE_MATCH | RESTRICTED_MATCH | LOW_POWER_STANDBY_MATCH)
    // Warning: funky bit-wise arithmetic: in parallel, for all DROP_IF_SET/UNSET rules
    // check whether the rules are globally enabled, and if so whether the rules are
    // set/unset for the specific uid. DROP if that is the case for ANY of the rules.
    // We achieve this by masking out only the bits/rules we're interested in checking,
    // and negating (via bit-wise xor) the bits/rules that should drop if unset.
    if (enabledRules & (DROP_IF_SET | DROP_IF_UNSET) & (uidRules ^ DROP_IF_UNSET)) return DROP;
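A worked example of the masking trick (values illustrative): suppose only DOZABLE is globally enabled and the uid has not been added to the dozable allowlist:

// enabledRules = DOZABLE_MATCH, uidRules = 0
// uidRules ^ DROP_IF_UNSET         -> every DROP_IF_UNSET bit set (uid bit was 0)
// & (DROP_IF_SET | DROP_IF_UNSET)  -> DOZABLE_MATCH still set
// & enabledRules                   -> DOZABLE_MATCH != 0, so the packet is DROPped
// Conversely, with uidRules = DOZABLE_MATCH the xor clears that bit and the uid PASSes.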
    if (!egress && skb->ifindex != 1) {
        if (uidRules & IIF_MATCH) {
            if (allowed_iif && skb->ifindex != allowed_iif) {
                // Drops packets not coming from lo nor the allowed interface
                // allowed interface=0 is a wildcard and does not drop packets
                return DROP_UNLESS_DNS;
            }
        } else if (uidRules & LOCKDOWN_VPN_MATCH) {
            // Drops packets not coming from lo and rule does not have IIF_MATCH but has
            // LOCKDOWN_VPN_MATCH
            return DROP_UNLESS_DNS;
        }
    }
    return PASS;
}
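As an illustrative scenario (restating the logic above, not quoted from the source): with VPN lockdown active, a uid carrying LOCKDOWN_VPN_MATCH but no IIF_MATCH has every non-loopback ingress packet answered with DROP_UNLESS_DNS, while a uid whose IIF_MATCH is bound to the tun interface's ifindex still receives traffic arriving there; ifindex 1 is loopback and is always exempt.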