From 2cc512c1fa1ee99879d55d1cb4e3fd0e6eab35b3 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 24 Jul 2018 10:55:24 +0800 Subject: [PATCH 01/45] bpf: btf: fix inconsistent IS_ERR and PTR_ERR Fix inconsistent IS_ERR and PTR_ERR in get_btf, the proper pointer to be passed as argument is '*btf' This issue was detected with the help of Coccinelle. Fixes: 2d3feca8c44f ("bpf: btf: print map dump and lookup with btf info") Signed-off-by: YueHaibing Acked-by: David S. Miller Acked-by: Jakub Kicinski Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 9c8191845585..0ee3ba479d87 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -230,7 +230,7 @@ static int get_btf(struct bpf_map_info *map_info, struct btf **btf) *btf = btf__new((__u8 *)btf_info.btf, btf_info.btf_size, NULL); if (IS_ERR(*btf)) { - err = PTR_ERR(btf); + err = PTR_ERR(*btf); *btf = NULL; } From e66565f3bee141748d2c3b2ed0d4ecd455f634fa Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Tue, 24 Jul 2018 15:53:34 -0400 Subject: [PATCH 02/45] bpf: Add Python 3 support to selftests scripts for bpf Adjust tcp_client.py and tcp_server.py to work with Python 3 by using the print function, marking string literals as bytes, and using the newer exception syntax. This should be functionally equivalent and supports Python 3+. Signed-off-by: Jeremy Cline Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/tcp_client.py | 12 ++++++------ tools/testing/selftests/bpf/tcp_server.py | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py index 481dccdf140c..7f8200a8702b 100755 --- a/tools/testing/selftests/bpf/tcp_client.py +++ b/tools/testing/selftests/bpf/tcp_client.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # # SPDX-License-Identifier: GPL-2.0 # @@ -9,11 +9,11 @@ import subprocess import select def read(sock, n): - buf = '' + buf = b'' while len(buf) < n: rem = n - len(buf) try: s = sock.recv(rem) - except (socket.error), e: return '' + except (socket.error) as e: return b'' buf += s return buf @@ -22,7 +22,7 @@ def send(sock, s): count = 0 while count < total: try: n = sock.send(s) - except (socket.error), e: n = 0 + except (socket.error) as e: n = 0 if n == 0: return count; count += n @@ -39,10 +39,10 @@ try: except socket.error as e: sys.exit(1) -buf = '' +buf = b'' n = 0 while n < 1000: - buf += '+' + buf += b'+' n += 1 sock.settimeout(1); diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py index bc454d7d0be2..b39903fca4c8 100755 --- a/tools/testing/selftests/bpf/tcp_server.py +++ b/tools/testing/selftests/bpf/tcp_server.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # # SPDX-License-Identifier: GPL-2.0 # @@ -9,11 +9,11 @@ import subprocess import select def read(sock, n): - buf = '' + buf = b'' while len(buf) < n: rem = n - len(buf) try: s = sock.recv(rem) - except (socket.error), e: return '' + except (socket.error) as e: return b'' buf += s return buf @@ -22,7 +22,7 @@ def send(sock, s): count = 0 while count < total: try: n = sock.send(s) - except (socket.error), e: n = 0 + except (socket.error) as e: n = 0 if n == 0: return count; count += n @@ -43,7 +43,7 @@ host = socket.gethostname() try: serverSocket.bind((host, 0)) except socket.error as msg: - print 'bind fails: ', msg + print('bind fails: ' + str(msg)) sn = serverSocket.getsockname() serverPort = sn[1] @@ -51,10 +51,10 @@ serverPort = sn[1] cmdStr = ("./tcp_client.py %d &") % (serverPort) os.system(cmdStr) -buf = '' +buf = b'' n = 0 while n < 500: - buf += '.' + buf += b'.' n += 1 serverSocket.listen(MAX_PORTS) @@ -79,5 +79,5 @@ while True: serverSocket.close() sys.exit(0) else: - print 'Select timeout!' + print('Select timeout!') sys.exit(1) From 08a852528e9678f0854af331f19747f2b2a73c06 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Wed, 25 Jul 2018 15:36:51 +0900 Subject: [PATCH 03/45] tools/bpftool: ignore build products For untracked things of tools/bpf, add this. Reviewed-by: Jakub Kicinski Signed-off-by: Taeung Song Signed-off-by: Daniel Borkmann --- tools/bpf/.gitignore | 5 +++++ tools/bpf/bpftool/.gitignore | 2 ++ 2 files changed, 7 insertions(+) create mode 100644 tools/bpf/.gitignore diff --git a/tools/bpf/.gitignore b/tools/bpf/.gitignore new file mode 100644 index 000000000000..dfe2bd5a4b95 --- /dev/null +++ b/tools/bpf/.gitignore @@ -0,0 +1,5 @@ +FEATURE-DUMP.bpf +bpf_asm +bpf_dbg +bpf_exp.yacc.* +bpf_jit_disasm diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore index d7e678c2d396..67167e44b726 100644 --- a/tools/bpf/bpftool/.gitignore +++ b/tools/bpf/bpftool/.gitignore @@ -1,3 +1,5 @@ *.d bpftool +bpftool*.8 +bpf-helpers.* FEATURE-DUMP.bpftool From 598135e7444c121f11c8c16495ba1e6ab122678f Mon Sep 17 00:00:00 2001 From: Brian Brooks Date: Wed, 25 Jul 2018 16:08:19 -0500 Subject: [PATCH 04/45] samples/bpf: xdpsock: order memory on AArch64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define u_smp_rmb() and u_smp_wmb() to respective barrier instructions. This ensures the processor will order accesses to queue indices against accesses to queue ring entries. Signed-off-by: Brian Brooks Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- samples/bpf/xdpsock_user.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 5904b1543831..1e82f7c617c3 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -145,8 +145,13 @@ static void dump_stats(void); } while (0) #define barrier() __asm__ __volatile__("": : :"memory") +#ifdef __aarch64__ +#define u_smp_rmb() __asm__ __volatile__("dmb ishld": : :"memory") +#define u_smp_wmb() __asm__ __volatile__("dmb ishst": : :"memory") +#else #define u_smp_rmb() barrier() #define u_smp_wmb() barrier() +#endif #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) From 9778cfdfc9d94396d1464630073d696cdea84037 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 26 Jul 2018 19:13:44 +0900 Subject: [PATCH 05/45] samples/bpf: Add BTF build flags to Makefile To smoothly test BTF supported binary on samples/bpf, let samples/bpf/Makefile probe llc, pahole and llvm-objcopy for BPF support and use them like tools/testing/selftests/bpf/Makefile changed from the commit c0fa1b6c3efc ("bpf: btf: Add BTF tests"). Signed-off-by: Taeung Song Acked-by: Martin KaFai Lau Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- samples/bpf/Makefile | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 9ea2f7b64869..77f512625b8c 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -195,6 +195,8 @@ HOSTLOADLIBES_xdpsock += -pthread # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang LLC ?= llc CLANG ?= clang +LLVM_OBJCOPY ?= llvm-objcopy +BTF_PAHOLE ?= pahole # Detect that we're cross compiling and use the cross compiler ifdef CROSS_COMPILE @@ -202,6 +204,16 @@ HOSTCC = $(CROSS_COMPILE)gcc CLANG_ARCH_ARGS = -target $(ARCH) endif +BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris) +BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF) +BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm') + +ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),) + EXTRA_CFLAGS += -g + LLC_FLAGS += -mattr=dwarfris + DWARF2BTF = y +endif + # Trick to allow make to be run from this directory all: $(MAKE) -C ../../ $(CURDIR)/ BPF_SAMPLES_PATH=$(CURDIR) @@ -260,4 +272,7 @@ $(obj)/%.o: $(src)/%.c -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ - -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ + -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ +ifeq ($(DWARF2BTF),y) + $(BTF_PAHOLE) -J $@ +endif From 78a8b760e40881c883c5552db991664727a91868 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 25 Jul 2018 19:53:30 -0700 Subject: [PATCH 06/45] nfp: move repr handling on RX path Representor packets are received on PF queues with special metadata tag for demux. There is no reason to resolve the representor ID -> netdev after the skb has been allocated. Move the code, this will allow us to handle special FW messages without SKB allocation overhead. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: Daniel Borkmann --- .../ethernet/netronome/nfp/nfp_net_common.c | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index cf1704e972b7..cdfbd1e8bf4b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1757,6 +1757,21 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) } } + if (likely(!meta.portid)) { + netdev = dp->netdev; + } else { + struct nfp_net *nn; + + nn = netdev_priv(dp->netdev); + netdev = nfp_app_repr_get(nn->app, meta.portid); + if (unlikely(!netdev)) { + nfp_net_rx_drop(dp, r_vec, rx_ring, rxbuf, + NULL); + continue; + } + nfp_repr_inc_rx_stats(netdev, pkt_len); + } + skb = build_skb(rxbuf->frag, true_bufsz); if (unlikely(!skb)) { nfp_net_rx_drop(dp, r_vec, rx_ring, rxbuf, NULL); @@ -1772,20 +1787,6 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) nfp_net_rx_give_one(dp, rx_ring, new_frag, new_dma_addr); - if (likely(!meta.portid)) { - netdev = dp->netdev; - } else { - struct nfp_net *nn; - - nn = netdev_priv(dp->netdev); - netdev = nfp_app_repr_get(nn->app, meta.portid); - if (unlikely(!netdev)) { - nfp_net_rx_drop(dp, r_vec, rx_ring, NULL, skb); - continue; - } - nfp_repr_inc_rx_stats(netdev, pkt_len); - } - skb_reserve(skb, pkt_off); skb_put(skb, pkt_len); From 79ca38e80c4588adeb11a0abd116d72ab6fe0ecc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 25 Jul 2018 19:53:31 -0700 Subject: [PATCH 07/45] nfp: allow control message reception on data queues Port id 0xffffffff is reserved for control messages. Allow reception of messages with this id on data queues. Hand off a raw buffer to the higher layer code, without allocating SKB for max efficiency. The RX handle can't modify or keep the buffer, after it returns buffer is handed back over to the NIC RX free buffer list. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/nfp_app.c | 2 ++ drivers/net/ethernet/netronome/nfp/nfp_app.h | 17 +++++++++++++++++ .../net/ethernet/netronome/nfp/nfp_net_common.c | 11 +++++++++++ .../net/ethernet/netronome/nfp/nfp_net_ctrl.h | 1 + 4 files changed, 31 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c b/drivers/net/ethernet/netronome/nfp/nfp_app.c index 69d4ae7a61f3..8607d09ab732 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c @@ -172,6 +172,8 @@ struct nfp_app *nfp_app_alloc(struct nfp_pf *pf, enum nfp_app_id id) if (WARN_ON(!apps[id]->name || !apps[id]->vnic_alloc)) return ERR_PTR(-EINVAL); + if (WARN_ON(!apps[id]->ctrl_msg_rx && apps[id]->ctrl_msg_rx_raw)) + return ERR_PTR(-EINVAL); app = kzalloc(sizeof(*app), GFP_KERNEL); if (!app) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index afbc19aa66a8..ccb244cf6c30 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -98,6 +98,7 @@ extern const struct nfp_app_type app_abm; * @start: start application logic * @stop: stop application logic * @ctrl_msg_rx: control message handler + * @ctrl_msg_rx_raw: handler for control messages from data queues * @setup_tc: setup TC ndo * @bpf: BPF ndo offload-related calls * @xdp_offload: offload an XDP program @@ -150,6 +151,8 @@ struct nfp_app_type { void (*stop)(struct nfp_app *app); void (*ctrl_msg_rx)(struct nfp_app *app, struct sk_buff *skb); + void (*ctrl_msg_rx_raw)(struct nfp_app *app, const void *data, + unsigned int len); int (*setup_tc)(struct nfp_app *app, struct net_device *netdev, enum tc_setup_type type, void *type_data); @@ -318,6 +321,11 @@ static inline bool nfp_app_ctrl_has_meta(struct nfp_app *app) return app->type->ctrl_has_meta; } +static inline bool nfp_app_ctrl_uses_data_vnics(struct nfp_app *app) +{ + return app && app->type->ctrl_msg_rx_raw; +} + static inline const char *nfp_app_extra_cap(struct nfp_app *app, struct nfp_net *nn) { @@ -381,6 +389,15 @@ static inline void nfp_app_ctrl_rx(struct nfp_app *app, struct sk_buff *skb) app->type->ctrl_msg_rx(app, skb); } +static inline void +nfp_app_ctrl_rx_raw(struct nfp_app *app, const void *data, unsigned int len) +{ + trace_devlink_hwmsg(priv_to_devlink(app->pf), true, 0, data, len); + + if (app && app->type->ctrl_msg_rx_raw) + app->type->ctrl_msg_rx_raw(app, data, len); +} + static inline int nfp_app_eswitch_mode_get(struct nfp_app *app, u16 *mode) { if (!app->type->eswitch_mode_get) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index cdfbd1e8bf4b..ca42f7da4f50 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1759,6 +1759,14 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) if (likely(!meta.portid)) { netdev = dp->netdev; + } else if (meta.portid == NFP_META_PORT_ID_CTRL) { + struct nfp_net *nn = netdev_priv(dp->netdev); + + nfp_app_ctrl_rx_raw(nn->app, rxbuf->frag + pkt_off, + pkt_len); + nfp_net_rx_give_one(dp, rx_ring, rxbuf->frag, + rxbuf->dma_addr); + continue; } else { struct nfp_net *nn; @@ -3857,6 +3865,9 @@ int nfp_net_init(struct nfp_net *nn) nn->dp.mtu = NFP_NET_DEFAULT_MTU; nn->dp.fl_bufsz = nfp_net_calc_fl_bufsz(&nn->dp); + if (nfp_app_ctrl_uses_data_vnics(nn->app)) + nn->dp.ctrl |= nn->cap & NFP_NET_CFG_CTRL_CMSG_DATA; + if (nn->cap & NFP_NET_CFG_CTRL_RSS_ANY) { nfp_net_rss_init(nn); nn->dp.ctrl |= nn->cap & NFP_NET_CFG_CTRL_RSS2 ?: diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h index bb63c115537d..44d3ea75d043 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h @@ -127,6 +127,7 @@ #define NFP_NET_CFG_CTRL_GATHER (0x1 << 9) /* Gather DMA */ #define NFP_NET_CFG_CTRL_LSO (0x1 << 10) /* LSO/TSO (version 1) */ #define NFP_NET_CFG_CTRL_CTAG_FILTER (0x1 << 11) /* VLAN CTAG filtering */ +#define NFP_NET_CFG_CTRL_CMSG_DATA (0x1 << 12) /* RX cmsgs on data Qs */ #define NFP_NET_CFG_CTRL_RINGCFG (0x1 << 16) /* Ring runtime changes */ #define NFP_NET_CFG_CTRL_RSS (0x1 << 17) /* RSS (version 1) */ #define NFP_NET_CFG_CTRL_IRQMOD (0x1 << 18) /* Interrupt moderation */ From 20c54204219987d620ca9da567dd54e569863dad Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 25 Jul 2018 19:53:32 -0700 Subject: [PATCH 08/45] nfp: bpf: pass raw data buffer to nfp_bpf_event_output() In preparation for SKB-less perf event handling make nfp_bpf_event_output() take buffer address and length, not SKB as parameters. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 5 ++++- drivers/net/ethernet/netronome/nfp/bpf/main.h | 3 ++- .../net/ethernet/netronome/nfp/bpf/offload.c | 21 ++++++++----------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c index cb87fccb9f6a..0a89b53962aa 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c @@ -441,7 +441,10 @@ void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb) } if (nfp_bpf_cmsg_get_type(skb) == CMSG_TYPE_BPF_EVENT) { - nfp_bpf_event_output(bpf, skb); + if (!nfp_bpf_event_output(bpf, skb->data, skb->len)) + dev_consume_skb_any(skb); + else + dev_kfree_skb_any(skb); return; } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index bec935468f90..e25d3c0c7e43 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -501,7 +501,8 @@ int nfp_bpf_ctrl_lookup_entry(struct bpf_offloaded_map *offmap, int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap, void *key, void *next_key); -int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb); +int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data, + unsigned int len); void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb); #endif diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 49b03f7dbf46..293dda84818f 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -453,23 +453,24 @@ nfp_bpf_perf_event_copy(void *dst, const void *src, return 0; } -int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb) +int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data, + unsigned int len) { - struct cmsg_bpf_event *cbe = (void *)skb->data; + struct cmsg_bpf_event *cbe = (void *)data; u32 pkt_size, data_size; struct bpf_map *map; - if (skb->len < sizeof(struct cmsg_bpf_event)) - goto err_drop; + if (len < sizeof(struct cmsg_bpf_event)) + return -EINVAL; pkt_size = be32_to_cpu(cbe->pkt_size); data_size = be32_to_cpu(cbe->data_size); map = (void *)(unsigned long)be64_to_cpu(cbe->map_ptr); - if (skb->len < sizeof(struct cmsg_bpf_event) + pkt_size + data_size) - goto err_drop; + if (len < sizeof(struct cmsg_bpf_event) + pkt_size + data_size) + return -EINVAL; if (cbe->hdr.ver != CMSG_MAP_ABI_VERSION) - goto err_drop; + return -EINVAL; rcu_read_lock(); if (!rhashtable_lookup_fast(&bpf->maps_neutral, &map, @@ -477,7 +478,7 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb) rcu_read_unlock(); pr_warn("perf event: dest map pointer %px not recognized, dropping event\n", map); - goto err_drop; + return -EINVAL; } bpf_event_output(map, be32_to_cpu(cbe->cpu_id), @@ -485,11 +486,7 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb) cbe->data, pkt_size, nfp_bpf_perf_event_copy); rcu_read_unlock(); - dev_consume_skb_any(skb); return 0; -err_drop: - dev_kfree_skb_any(skb); - return -EINVAL; } static int From 0958762748e4cfeb19d881aa8d3fe5ba8b5bc50b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 25 Jul 2018 19:53:33 -0700 Subject: [PATCH 09/45] nfp: bpf: allow receiving perf events on data queues Control queue is fairly low latency, and requires SKB allocations, which means we can't even reach 0.5Msps with perf events. Allow perf events to be delivered to data queues. This allows us to not only use multiple queues, but also receive and deliver to user space more than 5Msps per queue (Xeon E5-2630 v4 2.20GHz, no retpolines). Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 18 ++++++++++++++++++ drivers/net/ethernet/netronome/nfp/bpf/main.c | 1 + drivers/net/ethernet/netronome/nfp/bpf/main.h | 3 +++ 3 files changed, 22 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c index 0a89b53962aa..1946291bf4fd 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c @@ -468,3 +468,21 @@ err_unlock: err_free: dev_kfree_skb_any(skb); } + +void +nfp_bpf_ctrl_msg_rx_raw(struct nfp_app *app, const void *data, unsigned int len) +{ + struct nfp_app_bpf *bpf = app->priv; + const struct cmsg_hdr *hdr = data; + + if (unlikely(len < sizeof(struct cmsg_reply_map_simple))) { + cmsg_warn(bpf, "cmsg drop - too short %d!\n", len); + return; + } + + if (hdr->type == CMSG_TYPE_BPF_EVENT) + nfp_bpf_event_output(bpf, data, len); + else + cmsg_warn(bpf, "cmsg drop - msg type %d with raw buffer!\n", + hdr->type); +} diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 994d2b756fe1..192e88981fb2 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -490,6 +490,7 @@ const struct nfp_app_type app_bpf = { .vnic_free = nfp_bpf_vnic_free, .ctrl_msg_rx = nfp_bpf_ctrl_msg_rx, + .ctrl_msg_rx_raw = nfp_bpf_ctrl_msg_rx_raw, .setup_tc = nfp_bpf_setup_tc, .bpf = nfp_ndo_bpf, diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index e25d3c0c7e43..017e0ae5e736 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -505,4 +505,7 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data, unsigned int len); void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb); +void +nfp_bpf_ctrl_msg_rx_raw(struct nfp_app *app, const void *data, + unsigned int len); #endif From ab01f4ac5faf6a0ea532fa65cf6b1c9b2019e49b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 25 Jul 2018 19:53:34 -0700 Subject: [PATCH 10/45] nfp: bpf: remember maps by ID Record perf maps by map ID, not raw kernel pointer. This helps with debug messages, because printing pointers to logs is frowned upon, and makes debug easier for the users, as map ID is something they should be more familiar with. Note that perf maps are offload neutral, therefore IDs won't be orphaned. While at it use a rate limited print helper for the error message. Reported-by: Kees Cook Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 2 -- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 12 ++++++---- drivers/net/ethernet/netronome/nfp/bpf/main.c | 4 ++-- drivers/net/ethernet/netronome/nfp/bpf/main.h | 3 +++ .../net/ethernet/netronome/nfp/bpf/offload.c | 22 +++++++++++-------- 5 files changed, 26 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c index 1946291bf4fd..2572a4b91c7c 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c @@ -43,8 +43,6 @@ #include "fw.h" #include "main.h" -#define cmsg_warn(bpf, msg...) nn_dp_warn(&(bpf)->app->ctrl->dp, msg) - #define NFP_BPF_TAG_ALLOC_SPAN (U16_MAX / 4) static bool nfp_bpf_all_tags_busy(struct nfp_app_bpf *bpf) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 1d9e36835404..3c22d27de9da 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -3883,6 +3883,7 @@ static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog) struct nfp_insn_meta *meta1, *meta2; struct nfp_bpf_map *nfp_map; struct bpf_map *map; + u32 id; nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) { if (meta1->skip || meta2->skip) @@ -3894,11 +3895,14 @@ static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog) map = (void *)(unsigned long)((u32)meta1->insn.imm | (u64)meta2->insn.imm << 32); - if (bpf_map_offload_neutral(map)) - continue; - nfp_map = map_to_offmap(map)->dev_priv; + if (bpf_map_offload_neutral(map)) { + id = map->id; + } else { + nfp_map = map_to_offmap(map)->dev_priv; + id = nfp_map->tid; + } - meta1->insn.imm = nfp_map->tid; + meta1->insn.imm = id; meta2->insn.imm = 0; } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 192e88981fb2..cce1d2945a32 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -45,8 +45,8 @@ const struct rhashtable_params nfp_bpf_maps_neutral_params = { .nelem_hint = 4, - .key_len = FIELD_SIZEOF(struct nfp_bpf_neutral_map, ptr), - .key_offset = offsetof(struct nfp_bpf_neutral_map, ptr), + .key_len = FIELD_SIZEOF(struct bpf_map, id), + .key_offset = offsetof(struct nfp_bpf_neutral_map, map_id), .head_offset = offsetof(struct nfp_bpf_neutral_map, l), .automatic_shrinking = true, }; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 017e0ae5e736..57573bfa8c03 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -47,6 +47,8 @@ #include "../nfp_asm.h" #include "fw.h" +#define cmsg_warn(bpf, msg...) nn_dp_warn(&(bpf)->app->ctrl->dp, msg) + /* For relocation logic use up-most byte of branch instruction as scratch * area. Remember to clear this before sending instructions to HW! */ @@ -221,6 +223,7 @@ struct nfp_bpf_map { struct nfp_bpf_neutral_map { struct rhash_head l; struct bpf_map *ptr; + u32 map_id; u32 count; }; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 293dda84818f..b1fbb3babc7f 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -67,7 +67,7 @@ nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, ASSERT_RTNL(); /* Reuse path - other offloaded program is already tracking this map. */ - record = rhashtable_lookup_fast(&bpf->maps_neutral, &map, + record = rhashtable_lookup_fast(&bpf->maps_neutral, &map->id, nfp_bpf_maps_neutral_params); if (record) { nfp_prog->map_records[nfp_prog->map_records_cnt++] = record; @@ -89,6 +89,7 @@ nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, } record->ptr = map; + record->map_id = map->id; record->count = 1; err = rhashtable_insert_fast(&bpf->maps_neutral, &record->l, @@ -457,15 +458,17 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data, unsigned int len) { struct cmsg_bpf_event *cbe = (void *)data; - u32 pkt_size, data_size; - struct bpf_map *map; + struct nfp_bpf_neutral_map *record; + u32 pkt_size, data_size, map_id; + u64 map_id_full; if (len < sizeof(struct cmsg_bpf_event)) return -EINVAL; pkt_size = be32_to_cpu(cbe->pkt_size); data_size = be32_to_cpu(cbe->data_size); - map = (void *)(unsigned long)be64_to_cpu(cbe->map_ptr); + map_id_full = be64_to_cpu(cbe->map_ptr); + map_id = map_id_full; if (len < sizeof(struct cmsg_bpf_event) + pkt_size + data_size) return -EINVAL; @@ -473,15 +476,16 @@ int nfp_bpf_event_output(struct nfp_app_bpf *bpf, const void *data, return -EINVAL; rcu_read_lock(); - if (!rhashtable_lookup_fast(&bpf->maps_neutral, &map, - nfp_bpf_maps_neutral_params)) { + record = rhashtable_lookup_fast(&bpf->maps_neutral, &map_id, + nfp_bpf_maps_neutral_params); + if (!record || map_id_full > U32_MAX) { rcu_read_unlock(); - pr_warn("perf event: dest map pointer %px not recognized, dropping event\n", - map); + cmsg_warn(bpf, "perf event: map id %lld (0x%llx) not recognized, dropping event\n", + map_id_full, map_id_full); return -EINVAL; } - bpf_event_output(map, be32_to_cpu(cbe->cpu_id), + bpf_event_output(record->ptr, be32_to_cpu(cbe->cpu_id), &cbe->data[round_up(pkt_size, 4)], data_size, cbe->data, pkt_size, nfp_bpf_perf_event_copy); rcu_read_unlock(); From 17082566a9d2d05eefa3a5d6d968aa1d4b87c73d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 25 Jul 2018 19:53:35 -0700 Subject: [PATCH 11/45] nfp: bpf: improve map offload info messages FW can put constraints on map element size to maximize resource use and efficiency. When user attempts offload of a map which does not fit into those constraints an informational message is printed to kernel logs to inform user about the reason offload failed. Map offload does not have access to any advanced error reporting like verifier log or extack. There is also currently no way for us to nicely expose the FW capabilities to user space. Given all those constraints we should make sure log messages are as informative as possible. Improve them. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: Daniel Borkmann --- .../net/ethernet/netronome/nfp/bpf/offload.c | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index b1fbb3babc7f..1ccd6371a15b 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -380,11 +380,23 @@ nfp_bpf_map_alloc(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap) bpf->maps.max_elems - bpf->map_elems_in_use); return -ENOMEM; } - if (offmap->map.key_size > bpf->maps.max_key_sz || - offmap->map.value_size > bpf->maps.max_val_sz || - round_up(offmap->map.key_size, 8) + + + if (round_up(offmap->map.key_size, 8) + round_up(offmap->map.value_size, 8) > bpf->maps.max_elem_sz) { - pr_info("elements don't fit in device constraints\n"); + pr_info("map elements too large: %u, FW max element size (key+value): %u\n", + round_up(offmap->map.key_size, 8) + + round_up(offmap->map.value_size, 8), + bpf->maps.max_elem_sz); + return -ENOMEM; + } + if (offmap->map.key_size > bpf->maps.max_key_sz) { + pr_info("map key size %u, FW max is %u\n", + offmap->map.key_size, bpf->maps.max_key_sz); + return -ENOMEM; + } + if (offmap->map.value_size > bpf->maps.max_val_sz) { + pr_info("map value size %u, FW max is %u\n", + offmap->map.value_size, bpf->maps.max_val_sz); return -ENOMEM; } From 1e960043e8ae65d6f53b3414586a3fb634461908 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 26 Jul 2018 14:32:18 -0700 Subject: [PATCH 12/45] tools: libbpf: handle NULL program gracefully in bpf_program__nth_fd() bpf_map__fd() handles NULL map gracefully and returns -EINVAL. bpf_program__fd() and bpf_program__nth_fd() crash in this case. Make the behaviour more consistent by validating prog pointer as well. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 955f8eafbf41..afa9860db755 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1991,6 +1991,9 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n) { int fd; + if (!prog) + return -EINVAL; + if (n >= prog->instances.nr || n < 0) { pr_warning("Can't get the %dth fd from program %s: only %d instances\n", n, prog->section_name, prog->instances.nr); From 6d4b198b0b23ca2a75785173a9b27afd1eee7040 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 26 Jul 2018 14:32:19 -0700 Subject: [PATCH 13/45] tools: libbpf: add bpf_object__find_program_by_title() Allow users to find programs by section names. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 12 ++++++++++++ tools/lib/bpf/libbpf.h | 3 +++ 2 files changed, 15 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index afa9860db755..857d3d16968e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -873,6 +873,18 @@ bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx) return NULL; } +struct bpf_program * +bpf_object__find_program_by_title(struct bpf_object *obj, const char *title) +{ + struct bpf_program *pos; + + bpf_object__for_each_program(pos, obj) { + if (pos->section_name && !strcmp(pos->section_name, title)) + return pos; + } + return NULL; +} + static int bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, Elf_Data *data, struct bpf_object *obj) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 1f8fc2060460..a295fe2f822b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -86,6 +86,9 @@ const char *bpf_object__name(struct bpf_object *obj); unsigned int bpf_object__kversion(struct bpf_object *obj); int bpf_object__btf_fd(const struct bpf_object *obj); +struct bpf_program * +bpf_object__find_program_by_title(struct bpf_object *obj, const char *title); + struct bpf_object *bpf_object__next(struct bpf_object *prev); #define bpf_object__for_each_safe(pos, tmp) \ for ((pos) = bpf_object__next(NULL), \ From e1a40ef418ca2d0ffb6d22f84a13cb0df66cf83c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 26 Jul 2018 14:32:20 -0700 Subject: [PATCH 14/45] samples: bpf: convert xdp_fwd_user.c to libbpf Convert xdp_fwd_user.c to use libbpf instead of bpf_load.o. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- samples/bpf/Makefile | 2 +- samples/bpf/xdp_fwd_user.c | 34 +++++++++++++++++++++++----------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 77f512625b8c..815dec22729d 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -106,7 +106,7 @@ syscall_tp-objs := bpf_load.o syscall_tp_user.o cpustat-objs := bpf_load.o cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o xdpsock-objs := bpf_load.o xdpsock_user.o -xdp_fwd-objs := bpf_load.o xdp_fwd_user.o +xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c index a87a2048ed32..f88e1d7093d6 100644 --- a/samples/bpf/xdp_fwd_user.c +++ b/samples/bpf/xdp_fwd_user.c @@ -24,8 +24,7 @@ #include #include -#include "bpf_load.h" -#include "bpf_util.h" +#include "bpf/libbpf.h" #include @@ -63,9 +62,15 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + const char *prog_name = "xdp_fwd"; + struct bpf_program *prog; char filename[PATH_MAX]; + struct bpf_object *obj; int opt, i, idx, err; - int prog_id = 0; + int prog_fd, map_fd; int attach = 1; int ret = 0; @@ -75,7 +80,7 @@ int main(int argc, char **argv) attach = 0; break; case 'D': - prog_id = 1; + prog_name = "xdp_fwd_direct"; break; default: usage(basename(argv[0])); @@ -90,6 +95,7 @@ int main(int argc, char **argv) if (attach) { snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog_load_attr.file = filename; if (access(filename, O_RDONLY) < 0) { printf("error accessing file %s: %s\n", @@ -97,19 +103,25 @@ int main(int argc, char **argv) return 1; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + return 1; + + prog = bpf_object__find_program_by_title(obj, prog_name); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + printf("program not found: %s\n", strerror(prog_fd)); return 1; } - - if (!prog_fd[prog_id]) { - printf("load_bpf_file: %s\n", strerror(errno)); + map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj, + "tx_port")); + if (map_fd < 0) { + printf("map not found: %s\n", strerror(map_fd)); return 1; } } if (attach) { for (i = 1; i < 64; ++i) - bpf_map_update_elem(map_fd[0], &i, &i, 0); + bpf_map_update_elem(map_fd, &i, &i, 0); } for (i = optind; i < argc; ++i) { @@ -126,7 +138,7 @@ int main(int argc, char **argv) if (err) ret = err; } else { - err = do_attach(idx, prog_fd[prog_id], argv[i]); + err = do_attach(idx, prog_fd, argv[i]); if (err) ret = err; } From 6748182c2d1850811a577fe060387f83d5fa0fe4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 26 Jul 2018 14:32:21 -0700 Subject: [PATCH 15/45] samples: bpf: convert xdpsock_user.c to libbpf Convert xdpsock_user.c to use libbpf instead of bpf_load.o. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- samples/bpf/Makefile | 2 +- samples/bpf/xdpsock_user.c | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 815dec22729d..f88d5683d6ee 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -105,7 +105,7 @@ xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o syscall_tp_user.o cpustat-objs := bpf_load.o cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o -xdpsock-objs := bpf_load.o xdpsock_user.o +xdpsock-objs := xdpsock_user.o xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 1e82f7c617c3..4914788b6727 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -26,7 +26,7 @@ #include #include -#include "bpf_load.h" +#include "bpf/libbpf.h" #include "bpf_util.h" #include @@ -891,7 +891,13 @@ static void l2fwd(struct xdpsock *xsk) int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_XDP, + }; + int prog_fd, qidconf_map, xsks_map; + struct bpf_object *obj; char xdp_filename[256]; + struct bpf_map *map; int i, ret, key = 0; pthread_t pt; @@ -904,24 +910,38 @@ int main(int argc, char **argv) } snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); + prog_load_attr.file = xdp_filename; - if (load_bpf_file(xdp_filename)) { - fprintf(stderr, "ERROR: load_bpf_file %s\n", bpf_log_buf); + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + exit(EXIT_FAILURE); + if (prog_fd < 0) { + fprintf(stderr, "ERROR: no program found: %s\n", + strerror(prog_fd)); exit(EXIT_FAILURE); } - if (!prog_fd[0]) { - fprintf(stderr, "ERROR: load_bpf_file: \"%s\"\n", - strerror(errno)); + map = bpf_object__find_map_by_name(obj, "qidconf_map"); + qidconf_map = bpf_map__fd(map); + if (qidconf_map < 0) { + fprintf(stderr, "ERROR: no qidconf map found: %s\n", + strerror(qidconf_map)); exit(EXIT_FAILURE); } - if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd[0], opt_xdp_flags) < 0) { + map = bpf_object__find_map_by_name(obj, "xsks_map"); + xsks_map = bpf_map__fd(map); + if (xsks_map < 0) { + fprintf(stderr, "ERROR: no xsks map found: %s\n", + strerror(xsks_map)); + exit(EXIT_FAILURE); + } + + if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) { fprintf(stderr, "ERROR: link set xdp fd failed\n"); exit(EXIT_FAILURE); } - ret = bpf_map_update_elem(map_fd[0], &key, &opt_queue, 0); + ret = bpf_map_update_elem(qidconf_map, &key, &opt_queue, 0); if (ret) { fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n"); exit(EXIT_FAILURE); @@ -938,7 +958,7 @@ int main(int argc, char **argv) /* ...and insert them into the map. */ for (i = 0; i < num_socks; i++) { key = i; - ret = bpf_map_update_elem(map_fd[1], &key, &xsks[i]->sfd, 0); + ret = bpf_map_update_elem(xsks_map, &key, &xsks[i]->sfd, 0); if (ret) { fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); exit(EXIT_FAILURE); From d9b9170a26536e9bfe2994d60925e10d16d411dc Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 26 Jul 2018 15:03:02 +1000 Subject: [PATCH 16/45] docs: bpf: Rename README.rst to index.rst Recently bpf/ docs were converted to use RST format. 'README.rst' was created but in order to fit in with the Sphinx build system this file should be named 'index.rst'. Rename file, fixes to integrate into Sphinx build system in following patches. docs: Rename Documentation/bpf/README.rst to Documentation/bpf/index.rst Signed-off-by: Tobin C. Harding Signed-off-by: Daniel Borkmann --- Documentation/bpf/{README.rst => index.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Documentation/bpf/{README.rst => index.rst} (100%) diff --git a/Documentation/bpf/README.rst b/Documentation/bpf/index.rst similarity index 100% rename from Documentation/bpf/README.rst rename to Documentation/bpf/index.rst From b3d40f63d20bbaf78c6b6e9112cde604135e30e0 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 26 Jul 2018 15:03:03 +1000 Subject: [PATCH 17/45] docs: bpf: Add toctree to index Recently bpf/ docs were converted to us RST format. bp/index.rst was created out of README but toctree was not added to include files within Documentation/bpf/ Add toctree to Documentation/bpf/index.rst Signed-off-by: Tobin C. Harding Signed-off-by: Daniel Borkmann --- Documentation/bpf/index.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index b9a80c9e9392..ab2ff13a809b 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -22,14 +22,14 @@ Frequently asked questions (FAQ) Two sets of Questions and Answers (Q&A) are maintained. -* QA for common questions about BPF see: bpf_design_QA_ +.. toctree:: + :maxdepth: 1 -* QA for developers interacting with BPF subsystem: bpf_devel_QA_ + bpf_design_QA + bpf_devel_QA .. Links: -.. _bpf_design_QA: bpf_design_QA.rst -.. _bpf_devel_QA: bpf_devel_QA.rst .. _Documentation/networking/filter.txt: ../networking/filter.txt .. _man-pages: https://www.kernel.org/doc/man-pages/ .. _bpf(2): http://man7.org/linux/man-pages/man2/bpf.2.html From 3209570da7a2dc80e3123dd837ced461f786e6dc Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 26 Jul 2018 15:03:04 +1000 Subject: [PATCH 18/45] docs: Add bpf/index to top level index Recently bpf docs were converted to RST format. The new files were not added to the top level toctree. This causes build system to emit a warning of type WARNING: document isn't included in any toctree Add bpf/index.rst to Documentation/index.rst Signed-off-by: Tobin C. Harding Signed-off-by: Daniel Borkmann --- Documentation/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/index.rst b/Documentation/index.rst index fdc585703498..086710b054db 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -90,6 +90,7 @@ needed). crypto/index filesystems/index vm/index + bpf/index Architecture-specific documentation ----------------------------------- From 6919bcc8aa2755be934d38bb7d8d7aee14b27e1d Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 26 Jul 2018 15:03:05 +1000 Subject: [PATCH 19/45] docs: bpf: Capitalise document heading The majority of files in the kernel documentation index use capitalisation for all words, especially the shorter ones. BPF docs better fit in with the rest of the documentation if the heading is all capitalised. Capitalise document heading. Signed-off-by: Tobin C. Harding Signed-off-by: Daniel Borkmann --- Documentation/bpf/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index ab2ff13a809b..00a8450a602f 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -1,5 +1,5 @@ ================= -BPF documentation +BPF Documentation ================= This directory contains documentation for the BPF (Berkeley Packet From 1ce6a9fc154935d9db771173ecde03fa9b42df4a Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 30 Jul 2018 10:53:23 +0200 Subject: [PATCH 20/45] bpf: fix build error in libbpf with EXTRA_CFLAGS="-Wp, -D_FORTIFY_SOURCE=2 -O2" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 531b014e7a2f ("tools: bpf: make use of reallocarray") causes a compiler error when building the perf tool in the linux-next tree. Compile file tools/lib/bpf/libbpf.c on a FEDORA 28 installation with gcc compiler version: gcc (GCC) 8.0.1 20180324 (Red Hat 8.0.1-0.20) shows this error message: [root@p23lp27] # make V=1 EXTRA_CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -O2" [...] make -f /home6/tmricht/linux-next/tools/build/Makefile.build dir=./util/scripting-engines obj=libperf libbpf.c: In function ‘bpf_object__elf_collect’: libbpf.c:811:15: error: ignoring return value of ‘strerror_r’, declared with attribute warn_unused_result [-Werror=unused-result] strerror_r(-err, errmsg, sizeof(errmsg)); ^ cc1: all warnings being treated as errors mv: cannot stat './.libbpf.o.tmp': No such file or directory /home6/tmricht/linux-next/tools/build/Makefile.build:96: recipe for target 'libbpf.o' failed Replace all occurrences of strerror() by calls to strerror_r(). To keep the compiler quiet also use the return value from strerror_r() otherwise a 'variable set but not use' warning which is treated as error terminates the compile. Fixes: 531b014e7a2f ("tools: bpf: make use of reallocarray") Suggested-by: Jakub Kicinski Suggested-by: Daniel Borkmann Signed-off-by: Thomas Richter Reviewed-by: Hendrik Brueckner Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 857d3d16968e..79fc7ed6995a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -467,8 +467,10 @@ static int bpf_object__elf_init(struct bpf_object *obj) } else { obj->efile.fd = open(obj->path, O_RDONLY); if (obj->efile.fd < 0) { - pr_warning("failed to open %s: %s\n", obj->path, - strerror(errno)); + char errmsg[STRERR_BUFSIZE]; + char *cp = strerror_r(errno, errmsg, sizeof(errmsg)); + + pr_warning("failed to open %s: %s\n", obj->path, cp); return -errno; } @@ -807,10 +809,11 @@ static int bpf_object__elf_collect(struct bpf_object *obj) data->d_size, name, idx); if (err) { char errmsg[STRERR_BUFSIZE]; + char *cp = strerror_r(-err, errmsg, + sizeof(errmsg)); - strerror_r(-err, errmsg, sizeof(errmsg)); pr_warning("failed to alloc program %s (%s): %s", - name, obj->path, errmsg); + name, obj->path, cp); } } else if (sh.sh_type == SHT_REL) { void *reloc = obj->efile.reloc; @@ -1104,6 +1107,7 @@ bpf_object__create_maps(struct bpf_object *obj) for (i = 0; i < obj->nr_maps; i++) { struct bpf_map *map = &obj->maps[i]; struct bpf_map_def *def = &map->def; + char *cp, errmsg[STRERR_BUFSIZE]; int *pfd = &map->fd; if (map->fd >= 0) { @@ -1131,8 +1135,9 @@ bpf_object__create_maps(struct bpf_object *obj) *pfd = bpf_create_map_xattr(&create_attr); if (*pfd < 0 && create_attr.btf_key_type_id) { + cp = strerror_r(errno, errmsg, sizeof(errmsg)); pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", - map->name, strerror(errno), errno); + map->name, cp, errno); create_attr.btf_fd = 0; create_attr.btf_key_type_id = 0; create_attr.btf_value_type_id = 0; @@ -1145,9 +1150,9 @@ bpf_object__create_maps(struct bpf_object *obj) size_t j; err = *pfd; + cp = strerror_r(errno, errmsg, sizeof(errmsg)); pr_warning("failed to create map (name: '%s'): %s\n", - map->name, - strerror(errno)); + map->name, cp); for (j = 0; j < i; j++) zclose(obj->maps[j].fd); return err; @@ -1299,6 +1304,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type, char *license, u32 kern_version, int *pfd, int prog_ifindex) { struct bpf_load_program_attr load_attr; + char *cp, errmsg[STRERR_BUFSIZE]; char *log_buf; int ret; @@ -1328,7 +1334,8 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type, } ret = -LIBBPF_ERRNO__LOAD; - pr_warning("load bpf program failed: %s\n", strerror(errno)); + cp = strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warning("load bpf program failed: %s\n", cp); if (log_buf && log_buf[0] != '\0') { ret = -LIBBPF_ERRNO__VERIFY; @@ -1627,6 +1634,7 @@ out: static int check_path(const char *path) { + char *cp, errmsg[STRERR_BUFSIZE]; struct statfs st_fs; char *dname, *dir; int err = 0; @@ -1640,7 +1648,8 @@ static int check_path(const char *path) dir = dirname(dname); if (statfs(dir, &st_fs)) { - pr_warning("failed to statfs %s: %s\n", dir, strerror(errno)); + cp = strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warning("failed to statfs %s: %s\n", dir, cp); err = -errno; } free(dname); @@ -1656,6 +1665,7 @@ static int check_path(const char *path) int bpf_program__pin_instance(struct bpf_program *prog, const char *path, int instance) { + char *cp, errmsg[STRERR_BUFSIZE]; int err; err = check_path(path); @@ -1674,7 +1684,8 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path, } if (bpf_obj_pin(prog->instances.fds[instance], path)) { - pr_warning("failed to pin program: %s\n", strerror(errno)); + cp = strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warning("failed to pin program: %s\n", cp); return -errno; } pr_debug("pinned program '%s'\n", path); @@ -1684,13 +1695,16 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path, static int make_dir(const char *path) { + char *cp, errmsg[STRERR_BUFSIZE]; int err = 0; if (mkdir(path, 0700) && errno != EEXIST) err = -errno; - if (err) - pr_warning("failed to mkdir %s: %s\n", path, strerror(-err)); + if (err) { + cp = strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warning("failed to mkdir %s: %s\n", path, cp); + } return err; } @@ -1737,6 +1751,7 @@ int bpf_program__pin(struct bpf_program *prog, const char *path) int bpf_map__pin(struct bpf_map *map, const char *path) { + char *cp, errmsg[STRERR_BUFSIZE]; int err; err = check_path(path); @@ -1749,7 +1764,8 @@ int bpf_map__pin(struct bpf_map *map, const char *path) } if (bpf_obj_pin(map->fd, path)) { - pr_warning("failed to pin map: %s\n", strerror(errno)); + cp = strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warning("failed to pin map: %s\n", cp); return -errno; } From 486cdf21583e5b1fad488a3e4f0a5242a31c0ffa Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Thu, 26 Jul 2018 02:10:40 +0000 Subject: [PATCH 21/45] bpf: add End.DT6 action to bpf_lwt_seg6_action helper The seg6local LWT provides the End.DT6 action, which allows to decapsulate an outer IPv6 header containing a Segment Routing Header (SRH), full specification is available here: https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-05 This patch adds this action now to the seg6local BPF interface. Since it is not mandatory that the inner IPv6 header also contains a SRH, seg6_bpf_srh_state has been extended with a pointer to a possible SRH of the outermost IPv6 header. This helps assessing if the validation must be triggered or not, and avoids some calls to ipv6_find_hdr. v3: s/1/true, s/0/false for boolean values v2: - changed true/false -> 1/0 - preempt_enable no longer called in first conditional block Signed-off-by: Mathieu Xhonneux Signed-off-by: Daniel Borkmann --- include/net/seg6_local.h | 4 +- net/core/filter.c | 88 +++++++++++++++++++++++++++------------- net/ipv6/seg6_local.c | 50 +++++++++++++++-------- 3 files changed, 94 insertions(+), 48 deletions(-) diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h index 661fd5b4d3e0..08359e2d8b35 100644 --- a/include/net/seg6_local.h +++ b/include/net/seg6_local.h @@ -21,10 +21,12 @@ extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id); +extern bool seg6_bpf_has_valid_srh(struct sk_buff *skb); struct seg6_bpf_srh_state { - bool valid; + struct ipv6_sr_hdr *srh; u16 hdrlen; + bool valid; }; DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); diff --git a/net/core/filter.c b/net/core/filter.c index 104d560946da..7df1a0f1d1e1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4542,26 +4542,28 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_tlvs, *srh_end, *ptr; - struct ipv6_sr_hdr *srh; int srhoff = 0; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + if (srh == NULL) return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (ptr >= srh_tlvs && ptr + len <= srh_end) - srh_state->valid = 0; + srh_state->valid = false; else if (ptr < (void *)&srh->flags || ptr + len > (void *)&srh->segments) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); memcpy(skb->data + offset, from, len); return 0; @@ -4577,52 +4579,78 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .arg4_type = ARG_CONST_SIZE }; +static void bpf_update_srh_state(struct sk_buff *skb) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + int srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { + srh_state->srh = NULL; + } else { + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_state->hdrlen = srh_state->srh->hdrlen << 3; + srh_state->valid = true; + } +} + BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); - struct ipv6_sr_hdr *srh; - int srhoff = 0; + int hdroff = 0; int err; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) - return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - - if (!srh_state->valid) { - if (unlikely((srh_state->hdrlen & 7) != 0)) - return -EBADMSG; - - srh->hdrlen = (u8)(srh_state->hdrlen >> 3); - if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) - return -EBADMSG; - - srh_state->valid = 1; - } - switch (action) { case SEG6_LOCAL_ACTION_END_X: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; if (param_len != sizeof(struct in6_addr)) return -EINVAL; return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); case SEG6_LOCAL_ACTION_END_T: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_DT6: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + if (param_len != sizeof(int)) + return -EINVAL; + + if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) + return -EBADMSG; + if (!pskb_pull(skb, hdroff)) + return -EBADMSG; + + skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->encapsulation = 0; + + bpf_compute_data_pointers(skb); + bpf_update_srh_state(skb); + return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_B6: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, param, param_len); if (!err) - srh_state->hdrlen = - ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + bpf_update_srh_state(skb); + return err; case SEG6_LOCAL_ACTION_END_B6_ENCAP: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, param, param_len); if (!err) - srh_state->hdrlen = - ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + bpf_update_srh_state(skb); + return err; default: return -EINVAL; @@ -4644,15 +4672,14 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_end, *srh_tlvs, *ptr; - struct ipv6_sr_hdr *srh; struct ipv6hdr *hdr; int srhoff = 0; int ret; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + if (unlikely(srh == NULL)) return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + ((srh->first_segment + 1) << 4)); @@ -4682,8 +4709,11 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, hdr = (struct ipv6hdr *)skb->data; hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen += len; - srh_state->valid = 0; + srh_state->valid = false; return 0; } diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index e1025b493a18..60325dbfe88b 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -459,36 +459,57 @@ drop: DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); +bool seg6_bpf_has_valid_srh(struct sk_buff *skb) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; + + if (unlikely(srh == NULL)) + return false; + + if (unlikely(!srh_state->valid)) { + if ((srh_state->hdrlen & 7) != 0) + return false; + + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); + if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)) + return false; + + srh_state->valid = true; + } + + return true; +} + static int input_action_end_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); - struct seg6_bpf_srh_state local_srh_state; struct ipv6_sr_hdr *srh; - int srhoff = 0; int ret; srh = get_and_validate_srh(skb); - if (!srh) - goto drop; + if (!srh) { + kfree_skb(skb); + return -EINVAL; + } advance_nextseg(srh, &ipv6_hdr(skb)->daddr); /* preempt_disable is needed to protect the per-CPU buffer srh_state, * which is also accessed by the bpf_lwt_seg6_* helpers */ preempt_disable(); + srh_state->srh = srh; srh_state->hdrlen = srh->hdrlen << 3; - srh_state->valid = 1; + srh_state->valid = true; rcu_read_lock(); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb); rcu_read_unlock(); - local_srh_state = *srh_state; - preempt_enable(); - switch (ret) { case BPF_OK: case BPF_REDIRECT: @@ -500,24 +521,17 @@ static int input_action_end_bpf(struct sk_buff *skb, goto drop; } - if (unlikely((local_srh_state.hdrlen & 7) != 0)) - goto drop; - - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) - goto drop; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3); - - if (!local_srh_state.valid && - unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) goto drop; + preempt_enable(); if (ret != BPF_REDIRECT) seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); drop: + preempt_enable(); kfree_skb(skb); return -EINVAL; } From 1f821611f49a89d2258d256efedd618eda6344be Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 30 Jul 2018 22:22:59 +0900 Subject: [PATCH 22/45] lwt_bpf: remove unnecessary rcu_read_lock in run_lwt_bpf run_lwt_bpf is called by bpf_{input/output/xmit}. These functions are already protected by rcu_read_lock. because lwtunnel_{input/output/xmit} holds rcu_read_lock and then calls bpf_{input/output/xmit}. So that rcu_read_lock in the run_lwt_bpf is unnecessary. Signed-off-by: Taehee Yoo Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- net/core/lwt_bpf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index e7e626fb87bb..a49c7baf62f8 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -50,10 +50,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, * mixing with BH RCU lock doesn't work. */ preempt_disable(); - rcu_read_lock(); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); - rcu_read_unlock(); switch (ret) { case BPF_OK: From 240b74fde35293f718c6cdfa486b3a005c4243c9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Jul 2018 20:33:15 -0700 Subject: [PATCH 23/45] nfp: fix variable dereferenced before check in nfp_app_ctrl_rx_raw() 'app' is dereferenced before used for the devlink trace point. In case FW is buggy and sends a control message to a VF queue we should make sure app is not NULL. Reported-by: Dan Carpenter Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/nfp_app.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index ccb244cf6c30..4e1eb3395648 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -392,10 +392,11 @@ static inline void nfp_app_ctrl_rx(struct nfp_app *app, struct sk_buff *skb) static inline void nfp_app_ctrl_rx_raw(struct nfp_app *app, const void *data, unsigned int len) { - trace_devlink_hwmsg(priv_to_devlink(app->pf), true, 0, data, len); + if (!app || !app->type->ctrl_msg_rx_raw) + return; - if (app && app->type->ctrl_msg_rx_raw) - app->type->ctrl_msg_rx_raw(app, data, len); + trace_devlink_hwmsg(priv_to_devlink(app->pf), true, 0, data, len); + app->type->ctrl_msg_rx_raw(app, data, len); } static inline int nfp_app_eswitch_mode_get(struct nfp_app *app, u16 *mode) From d692f1138a4bac2efd2c8656ca15556b63479e82 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 30 Jul 2018 17:42:28 -0700 Subject: [PATCH 24/45] bpf: Support bpf_get_socket_cookie in more prog types bpf_get_socket_cookie() helper can be used to identify skb that correspond to the same socket. Though socket cookie can be useful in many other use-cases where socket is available in program context. Specifically BPF_PROG_TYPE_CGROUP_SOCK_ADDR and BPF_PROG_TYPE_SOCK_OPS programs can benefit from it so that one of them can augment a value in a map prepared earlier by other program for the same socket. The patch adds support to call bpf_get_socket_cookie() from BPF_PROG_TYPE_CGROUP_SOCK_ADDR and BPF_PROG_TYPE_SOCK_OPS. It doesn't introduce new helpers. Instead it reuses same helper name bpf_get_socket_cookie() but adds support to this helper to accept `struct bpf_sock_addr` and `struct bpf_sock_ops`. Documentation in bpf.h is changed in a way that should not break automatic generation of markdown. Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 14 ++++++++++++++ net/core/filter.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 870113916cac..0ebaaf7f3568 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1371,6 +1371,20 @@ union bpf_attr { * A 8-byte long non-decreasing number on success, or 0 if the * socket field is missing inside *skb*. * + * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_addr** contex. + * Return + * A 8-byte long non-decreasing number. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_ops** contex. + * Return + * A 8-byte long non-decreasing number. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket diff --git a/net/core/filter.c b/net/core/filter.c index 7df1a0f1d1e1..9bb9a4488e25 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3812,6 +3812,30 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return sock_gen_cookie(ctx->sk); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { + .func = bpf_get_socket_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return sock_gen_cookie(ctx->sk); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { + .func = bpf_get_socket_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -4818,6 +4842,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) default: return NULL; } + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_addr_proto; default: return bpf_base_func_proto(func_id); } @@ -4960,6 +4986,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_map_update_proto; case BPF_FUNC_sock_hash_update: return &bpf_sock_hash_update_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_ops_proto; default: return bpf_base_func_proto(func_id); } From a40b712e4c4d09f6fa234872384da9705443d4e9 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 30 Jul 2018 17:42:29 -0700 Subject: [PATCH 25/45] bpf: Sync bpf.h to tools/ Sync bpf_get_socket_cookie() related bpf UAPI changes to tools/. Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 870113916cac..0ebaaf7f3568 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1371,6 +1371,20 @@ union bpf_attr { * A 8-byte long non-decreasing number on success, or 0 if the * socket field is missing inside *skb*. * + * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_addr** contex. + * Return + * A 8-byte long non-decreasing number. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_ops** contex. + * Return + * A 8-byte long non-decreasing number. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket From 0289a2cca0a5b75a0167243429d9163ec3fdf279 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 30 Jul 2018 17:42:30 -0700 Subject: [PATCH 26/45] selftests/bpf: Add bpf_get_socket_cookie to bpf_helpers.h Add missing helper to bpf_helpers.h that is used in tests and samples. Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/bpf_helpers.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index f2f28b6c8915..19a424483f6e 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -65,6 +65,8 @@ static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = (void *) BPF_FUNC_xdp_adjust_head; static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) = (void *) BPF_FUNC_xdp_adjust_meta; +static int (*bpf_get_socket_cookie)(void *ctx) = + (void *) BPF_FUNC_get_socket_cookie; static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, int optlen) = (void *) BPF_FUNC_setsockopt; From 194db0d95802fb48d03034fb6bfead1235de3450 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 30 Jul 2018 17:42:31 -0700 Subject: [PATCH 27/45] selftests/bpf: Test for get_socket_cookie Add test to use get_socket_cookie() from BPF programs of types BPF_PROG_TYPE_SOCK_OPS and BPF_PROG_TYPE_CGROUP_SOCK_ADDR. The test attaches two programs to cgroup, runs TCP server and client in the cgroup and checks that two operations are done properly on client socket when user calls connect(2): 1. In BPF_CGROUP_INET6_CONNECT socket cookie is used as the key to write new value in a map for client socket. 2. In BPF_CGROUP_SOCK_OPS (BPF_SOCK_OPS_TCP_CONNECT_CB callback) the value written in "1." is found by socket cookie, since it's the same socket, and updated. Finally the test verifies the value in the map. Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 6 +- .../selftests/bpf/socket_cookie_prog.c | 60 +++++ .../selftests/bpf/test_socket_cookie.c | 225 ++++++++++++++++++ 3 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/socket_cookie_prog.c create mode 100644 tools/testing/selftests/bpf/test_socket_cookie.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 478bf1bcbbf5..1b28277998e2 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -22,7 +22,8 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ - test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user + test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ + test_socket_cookie TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ @@ -33,7 +34,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \ test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ - get_cgroup_id_kern.o + get_cgroup_id_kern.o socket_cookie_prog.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -60,6 +61,7 @@ $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_sock: cgroup_helpers.c $(OUTPUT)/test_sock_addr: cgroup_helpers.c +$(OUTPUT)/test_socket_cookie: cgroup_helpers.c $(OUTPUT)/test_sockmap: cgroup_helpers.c $(OUTPUT)/test_tcpbpf_user: cgroup_helpers.c $(OUTPUT)/test_progs: trace_helpers.c diff --git a/tools/testing/selftests/bpf/socket_cookie_prog.c b/tools/testing/selftests/bpf/socket_cookie_prog.c new file mode 100644 index 000000000000..9ff8ac4b0bf6 --- /dev/null +++ b/tools/testing/selftests/bpf/socket_cookie_prog.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include +#include + +#include "bpf_helpers.h" +#include "bpf_endian.h" + +struct bpf_map_def SEC("maps") socket_cookies = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__u64), + .value_size = sizeof(__u32), + .max_entries = 1 << 8, +}; + +SEC("cgroup/connect6") +int set_cookie(struct bpf_sock_addr *ctx) +{ + __u32 cookie_value = 0xFF; + __u64 cookie_key; + + if (ctx->family != AF_INET6 || ctx->user_family != AF_INET6) + return 1; + + cookie_key = bpf_get_socket_cookie(ctx); + if (bpf_map_update_elem(&socket_cookies, &cookie_key, &cookie_value, 0)) + return 0; + + return 1; +} + +SEC("sockops") +int update_cookie(struct bpf_sock_ops *ctx) +{ + __u32 new_cookie_value; + __u32 *cookie_value; + __u64 cookie_key; + + if (ctx->family != AF_INET6) + return 1; + + if (ctx->op != BPF_SOCK_OPS_TCP_CONNECT_CB) + return 1; + + cookie_key = bpf_get_socket_cookie(ctx); + + cookie_value = bpf_map_lookup_elem(&socket_cookies, &cookie_key); + if (!cookie_value) + return 1; + + new_cookie_value = (ctx->local_port << 8) | *cookie_value; + bpf_map_update_elem(&socket_cookies, &cookie_key, &new_cookie_value, 0); + + return 1; +} + +int _version SEC("version") = 1; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_socket_cookie.c b/tools/testing/selftests/bpf/test_socket_cookie.c new file mode 100644 index 000000000000..68e108e4687a --- /dev/null +++ b/tools/testing/selftests/bpf/test_socket_cookie.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "bpf_rlimit.h" +#include "cgroup_helpers.h" + +#define CG_PATH "/foo" +#define SOCKET_COOKIE_PROG "./socket_cookie_prog.o" + +static int start_server(void) +{ + struct sockaddr_in6 addr; + int fd; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (fd == -1) { + log_err("Failed to create server socket"); + goto out; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin6_family = AF_INET6; + addr.sin6_addr = in6addr_loopback; + addr.sin6_port = 0; + + if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) == -1) { + log_err("Failed to bind server socket"); + goto close_out; + } + + if (listen(fd, 128) == -1) { + log_err("Failed to listen on server socket"); + goto close_out; + } + + goto out; + +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static int connect_to_server(int server_fd) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int fd; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (fd == -1) { + log_err("Failed to create client socket"); + goto out; + } + + if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + goto close_out; + } + + if (connect(fd, (const struct sockaddr *)&addr, len) == -1) { + log_err("Fail to connect to server"); + goto close_out; + } + + goto out; + +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static int validate_map(struct bpf_map *map, int client_fd) +{ + __u32 cookie_expected_value; + struct sockaddr_in6 addr; + socklen_t len = sizeof(addr); + __u32 cookie_value; + __u64 cookie_key; + int err = 0; + int map_fd; + + if (!map) { + log_err("Map not found in BPF object"); + goto err; + } + + map_fd = bpf_map__fd(map); + + err = bpf_map_get_next_key(map_fd, NULL, &cookie_key); + if (err) { + log_err("Can't get cookie key from map"); + goto out; + } + + err = bpf_map_lookup_elem(map_fd, &cookie_key, &cookie_value); + if (err) { + log_err("Can't get cookie value from map"); + goto out; + } + + err = getsockname(client_fd, (struct sockaddr *)&addr, &len); + if (err) { + log_err("Can't get client local addr"); + goto out; + } + + cookie_expected_value = (ntohs(addr.sin6_port) << 8) | 0xFF; + if (cookie_value != cookie_expected_value) { + log_err("Unexpected value in map: %x != %x", cookie_value, + cookie_expected_value); + goto err; + } + + goto out; +err: + err = -1; +out: + return err; +} + +static int run_test(int cgfd) +{ + enum bpf_attach_type attach_type; + struct bpf_prog_load_attr attr; + struct bpf_program *prog; + struct bpf_object *pobj; + const char *prog_name; + int server_fd = -1; + int client_fd = -1; + int prog_fd = -1; + int err = 0; + + memset(&attr, 0, sizeof(attr)); + attr.file = SOCKET_COOKIE_PROG; + attr.prog_type = BPF_PROG_TYPE_UNSPEC; + + err = bpf_prog_load_xattr(&attr, &pobj, &prog_fd); + if (err) { + log_err("Failed to load %s", attr.file); + goto out; + } + + bpf_object__for_each_program(prog, pobj) { + prog_name = bpf_program__title(prog, /*needs_copy*/ false); + + if (strcmp(prog_name, "cgroup/connect6") == 0) { + attach_type = BPF_CGROUP_INET6_CONNECT; + } else if (strcmp(prog_name, "sockops") == 0) { + attach_type = BPF_CGROUP_SOCK_OPS; + } else { + log_err("Unexpected prog: %s", prog_name); + goto err; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgfd, attach_type, + BPF_F_ALLOW_OVERRIDE); + if (err) { + log_err("Failed to attach prog %s", prog_name); + goto out; + } + } + + server_fd = start_server(); + if (server_fd == -1) + goto err; + + client_fd = connect_to_server(server_fd); + if (client_fd == -1) + goto err; + + if (validate_map(bpf_map__next(NULL, pobj), client_fd)) + goto err; + + goto out; +err: + err = -1; +out: + close(client_fd); + close(server_fd); + bpf_object__close(pobj); + printf("%s\n", err ? "FAILED" : "PASSED"); + return err; +} + +int main(int argc, char **argv) +{ + int cgfd = -1; + int err = 0; + + if (setup_cgroup_environment()) + goto err; + + cgfd = create_and_get_cgroup(CG_PATH); + if (!cgfd) + goto err; + + if (join_cgroup(CG_PATH)) + goto err; + + if (run_test(cgfd)) + goto err; + + goto out; +err: + err = -1; +out: + close(cgfd); + cleanup_cgroup_environment(); + return err; +} From fbeb1603bf4e9baa82da8f794de42949d0fe5e25 Mon Sep 17 00:00:00 2001 From: Arthur Fabre Date: Tue, 31 Jul 2018 18:17:22 +0100 Subject: [PATCH 28/45] bpf: verifier: MOV64 don't mark dst reg unbounded When check_alu_op() handles a BPF_MOV64 between two registers, it calls check_reg_arg(DST_OP) on the dst register, marking it as unbounded. If the src and dst register are the same, this marks the src as unbounded, which can lead to unexpected errors for further checks that rely on bounds info. For example: BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_2), BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), Results in: "math between ctx pointer and register with unbounded min value is not allowed" check_alu_op() now uses check_reg_arg(DST_OP_NO_MARK), and MOVs that need to mark the dst register (MOVIMM, MOV32) do so. Added a test case for MOV64 dst == src, and dst != src. Signed-off-by: Arthur Fabre Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 6 +++-- tools/testing/selftests/bpf/test_verifier.c | 26 +++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25e47c195874..e948303a0ea8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3238,8 +3238,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } - /* check dest operand */ - err = check_reg_arg(env, insn->dst_reg, DST_OP); + /* check dest operand, mark as required later */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; @@ -3265,6 +3265,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: R = imm * remember the value we stored into this reg */ + /* clear any state __mark_reg_known doesn't set */ + mark_reg_unknown(env, regs, insn->dst_reg); regs[insn->dst_reg].type = SCALAR_VALUE; if (BPF_CLASS(insn->code) == BPF_ALU64) { __mark_reg_known(regs + insn->dst_reg, diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index f5f7bcc96046..c582afba9d1f 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -12332,6 +12332,32 @@ static struct bpf_test tests[] = { .result = REJECT, .errstr = "variable ctx access var_off=(0x0; 0x4)", }, + { + "mov64 src == dst", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_2), + // Check bounds are OK + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { + "mov64 src != dst", + .insns = { + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_3), + // Check bounds are OK + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) From 0a4c58f5702858822621fa1177c7d3475f181ccb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:17 -0700 Subject: [PATCH 29/45] bpf: add ability to charge bpf maps memory dynamically This commits extends existing bpf maps memory charging API to support dynamic charging/uncharging. This is required to account memory used by maps, if all entries are created dynamically after the map initialization. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 62 ++++++++++++++++++++++++++++++++------------ 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b5ad95cf339..5a4a256473c3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -435,6 +435,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a31a1ba0f8ea..7958252a4d29 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -181,32 +181,60 @@ int bpf_map_precharge_memlock(u32 pages) return 0; } -static int bpf_map_charge_memlock(struct bpf_map *map) +static int bpf_charge_memlock(struct user_struct *user, u32 pages) { - struct user_struct *user = get_current_user(); - unsigned long memlock_limit; + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - atomic_long_add(map->pages, &user->locked_vm); - - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(map->pages, &user->locked_vm); - free_uid(user); + if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); return -EPERM; } - map->user = user; return 0; } -static void bpf_map_uncharge_memlock(struct bpf_map *map) +static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) +{ + atomic_long_sub(pages, &user->locked_vm); +} + +static int bpf_map_init_memlock(struct bpf_map *map) +{ + struct user_struct *user = get_current_user(); + int ret; + + ret = bpf_charge_memlock(user, map->pages); + if (ret) { + free_uid(user); + return ret; + } + map->user = user; + return ret; +} + +static void bpf_map_release_memlock(struct bpf_map *map) { struct user_struct *user = map->user; - - atomic_long_sub(map->pages, &user->locked_vm); + bpf_uncharge_memlock(user, map->pages); free_uid(user); } +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) +{ + int ret; + + ret = bpf_charge_memlock(map->user, pages); + if (ret) + return ret; + map->pages += pages; + return ret; +} + +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) +{ + bpf_uncharge_memlock(map->user, pages); + map->pages -= pages; +} + static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -256,7 +284,7 @@ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); @@ -492,7 +520,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; - err = bpf_map_charge_memlock(map); + err = bpf_map_init_memlock(map); if (err) goto free_map_sec; @@ -515,7 +543,7 @@ static int map_create(union bpf_attr *attr) return err; free_map: - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); free_map_nouncharge: From de9cbbaadba5adf88a19e46df61f7054000838f6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:18 -0700 Subject: [PATCH 30/45] bpf: introduce cgroup storage maps This commit introduces BPF_MAP_TYPE_CGROUP_STORAGE maps: a special type of maps which are implementing the cgroup storage. >From the userspace point of view it's almost a generic hash map with the (cgroup inode id, attachment type) pair used as a key. The only difference is that some operations are restricted: 1) a user can't create new entries, 2) a user can't remove existing entries. The lookup from userspace is o(log(n)). Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 38 ++++ include/linux/bpf.h | 1 + include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 6 + kernel/bpf/Makefile | 1 + kernel/bpf/local_storage.c | 376 +++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 + kernel/bpf/verifier.c | 12 ++ 8 files changed, 440 insertions(+) create mode 100644 kernel/bpf/local_storage.c diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d50c2f0a655a..7d00d58869ed 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -4,19 +4,39 @@ #include #include +#include #include struct sock; struct sockaddr; struct cgroup; struct sk_buff; +struct bpf_map; +struct bpf_prog; struct bpf_sock_ops_kern; +struct bpf_cgroup_storage; #ifdef CONFIG_CGROUP_BPF extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +struct bpf_cgroup_storage_map; + +struct bpf_storage_buffer { + struct rcu_head rcu; + char data[0]; +}; + +struct bpf_cgroup_storage { + struct bpf_storage_buffer *buf; + struct bpf_cgroup_storage_map *map; + struct bpf_cgroup_storage_key key; + struct list_head list; + struct rb_node node; + struct rcu_head rcu; +}; + struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; @@ -77,6 +97,15 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type); +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -221,6 +250,15 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, + struct bpf_map *map) { return 0; } +static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, + struct bpf_map *map) {} +static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( + struct bpf_prog *prog) { return 0; } +static inline void bpf_cgroup_storage_free( + struct bpf_cgroup_storage *storage) {} + #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5a4a256473c3..9d1e4727495e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -282,6 +282,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ + struct bpf_map *cgroup_storage; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY void *security; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c5700c2d5549..add08be53b6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -37,6 +37,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) #endif +#ifdef CONFIG_CGROUP_BPF +BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0ebaaf7f3568..b10118ee5afe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -75,6 +75,11 @@ struct bpf_lpm_trie_key { __u8 data[0]; /* Arbitrary size */ }; +struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; /* cgroup inode id */ + __u32 attach_type; /* program attach type */ +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -120,6 +125,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, + BPF_MAP_TYPE_CGROUP_STORAGE, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f27f5496d6fe..e8906cbad81f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,6 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c new file mode 100644 index 000000000000..f23d3fdeba23 --- /dev/null +++ b/kernel/bpf/local_storage.c @@ -0,0 +1,376 @@ +//SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_CGROUP_BPF + +#define LOCAL_STORAGE_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +struct bpf_cgroup_storage_map { + struct bpf_map map; + + spinlock_t lock; + struct bpf_prog *prog; + struct rb_root root; + struct list_head list; +}; + +static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) +{ + return container_of(map, struct bpf_cgroup_storage_map, map); +} + +static int bpf_cgroup_storage_key_cmp( + const struct bpf_cgroup_storage_key *key1, + const struct bpf_cgroup_storage_key *key2) +{ + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + return 0; +} + +static struct bpf_cgroup_storage *cgroup_storage_lookup( + struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, + bool locked) +{ + struct rb_root *root = &map->root; + struct rb_node *node; + + if (!locked) + spin_lock_bh(&map->lock); + + node = root->rb_node; + while (node) { + struct bpf_cgroup_storage *storage; + + storage = container_of(node, struct bpf_cgroup_storage, node); + + switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + case -1: + node = node->rb_left; + break; + case 1: + node = node->rb_right; + break; + default: + if (!locked) + spin_unlock_bh(&map->lock); + return storage; + } + } + + if (!locked) + spin_unlock_bh(&map->lock); + + return NULL; +} + +static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, + struct bpf_cgroup_storage *storage) +{ + struct rb_root *root = &map->root; + struct rb_node **new = &(root->rb_node), *parent = NULL; + + while (*new) { + struct bpf_cgroup_storage *this; + + this = container_of(*new, struct bpf_cgroup_storage, node); + + parent = *new; + switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + case -1: + new = &((*new)->rb_left); + break; + case 1: + new = &((*new)->rb_right); + break; + default: + return -EEXIST; + } + } + + rb_link_node(&storage->node, parent, new); + rb_insert_color(&storage->node, root); + + return 0; +} + +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + + storage = cgroup_storage_lookup(map, key, false); + if (!storage) + return NULL; + + return &READ_ONCE(storage->buf)->data[0]; +} + +static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, + void *value, u64 flags) +{ + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + struct bpf_storage_buffer *new; + + if (flags & BPF_NOEXIST) + return -EINVAL; + + storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, + key, false); + if (!storage) + return -ENOENT; + + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!new) + return -ENOMEM; + + memcpy(&new->data[0], value, map->value_size); + + new = xchg(&storage->buf, new); + kfree_rcu(new, rcu); + + return 0; +} + +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, + void *_next_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage_key *next = _next_key; + struct bpf_cgroup_storage *storage; + + spin_lock_bh(&map->lock); + + if (list_empty(&map->list)) + goto enoent; + + if (key) { + storage = cgroup_storage_lookup(map, key, true); + if (!storage) + goto enoent; + + storage = list_next_entry(storage, list); + if (!storage) + goto enoent; + } else { + storage = list_first_entry(&map->list, + struct bpf_cgroup_storage, list); + } + + spin_unlock_bh(&map->lock); + next->attach_type = storage->key.attach_type; + next->cgroup_inode_id = storage->key.cgroup_inode_id; + return 0; + +enoent: + spin_unlock_bh(&map->lock); + return -ENOENT; +} + +static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +{ + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_cgroup_storage_map *map; + + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + return ERR_PTR(-EINVAL); + + if (attr->value_size > PAGE_SIZE) + return ERR_PTR(-E2BIG); + + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); + + if (attr->max_entries) + /* max_entries is not used and enforced to be 0 */ + return ERR_PTR(-EINVAL); + + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), + __GFP_ZERO | GFP_USER, numa_node); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), + PAGE_SIZE) >> PAGE_SHIFT; + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&map->map, attr); + + spin_lock_init(&map->lock); + map->root = RB_ROOT; + INIT_LIST_HEAD(&map->list); + + return &map->map; +} + +static void cgroup_storage_map_free(struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + WARN_ON(!RB_EMPTY_ROOT(&map->root)); + WARN_ON(!list_empty(&map->list)); + + kfree(map); +} + +static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +const struct bpf_map_ops cgroup_storage_map_ops = { + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = cgroup_storage_get_next_key, + .map_lookup_elem = cgroup_storage_lookup_elem, + .map_update_elem = cgroup_storage_update_elem, + .map_delete_elem = cgroup_storage_delete_elem, +}; + +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + int ret = -EBUSY; + + spin_lock_bh(&map->lock); + + if (map->prog && map->prog != prog) + goto unlock; + if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map) + goto unlock; + + map->prog = prog; + prog->aux->cgroup_storage = _map; + ret = 0; +unlock: + spin_unlock_bh(&map->lock); + + return ret; +} + +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + spin_lock_bh(&map->lock); + if (map->prog == prog) { + WARN_ON(prog->aux->cgroup_storage != _map); + map->prog = NULL; + prog->aux->cgroup_storage = NULL; + } + spin_unlock_bh(&map->lock); +} + +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog) +{ + struct bpf_cgroup_storage *storage; + struct bpf_map *map; + u32 pages; + + map = prog->aux->cgroup_storage; + if (!map) + return NULL; + + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + if (bpf_map_charge_memlock(map, pages)) + return ERR_PTR(-EPERM); + + storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), + __GFP_ZERO | GFP_USER, map->numa_node); + if (!storage) { + bpf_map_uncharge_memlock(map, pages); + return ERR_PTR(-ENOMEM); + } + + storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!storage->buf) { + bpf_map_uncharge_memlock(map, pages); + kfree(storage); + return ERR_PTR(-ENOMEM); + } + + storage->map = (struct bpf_cgroup_storage_map *)map; + + return storage; +} + +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) +{ + u32 pages; + struct bpf_map *map; + + if (!storage) + return; + + map = &storage->map->map; + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + bpf_map_uncharge_memlock(map, pages); + + kfree_rcu(storage->buf, rcu); + kfree_rcu(storage, rcu); +} + +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type) +{ + struct bpf_cgroup_storage_map *map; + + if (!storage) + return; + + storage->key.attach_type = type; + storage->key.cgroup_inode_id = cgroup->kn->id.id; + + map = storage->map; + + spin_lock_bh(&map->lock); + WARN_ON(cgroup_storage_insert(map, storage)); + list_add(&storage->list, &map->list); + spin_unlock_bh(&map->lock); +} + +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) +{ + struct bpf_cgroup_storage_map *map; + struct rb_root *root; + + if (!storage) + return; + + map = storage->map; + + spin_lock_bh(&map->lock); + root = &map->root; + rb_erase(&storage->node, root); + + list_del(&storage->list); + spin_unlock_bh(&map->lock); +} + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7958252a4d29..5af4e9e2722d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -957,6 +957,9 @@ static void free_used_maps(struct bpf_prog_aux *aux) { int i; + if (aux->cgroup_storage) + bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage); + for (i = 0; i < aux->used_map_cnt; i++) bpf_map_put(aux->used_maps[i]); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e948303a0ea8..7e75434a9e54 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5154,6 +5154,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && + bpf_cgroup_storage_assign(env->prog, map)) { + verbose(env, + "only one cgroup storage is allowed\n"); + fdput(f); + return -EBUSY; + } + fdput(f); next_insn: insn++; @@ -5180,6 +5188,10 @@ static void release_maps(struct bpf_verifier_env *env) { int i; + if (env->prog->aux->cgroup_storage) + bpf_cgroup_storage_release(env->prog, + env->prog->aux->cgroup_storage); + for (i = 0; i < env->used_map_cnt; i++) bpf_map_put(env->used_maps[i]); } From aa0ad5b0391e268bdecf6cda31268388844f8afd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:19 -0700 Subject: [PATCH 31/45] bpf: pass a pointer to a cgroup storage using pcpu variable This commit introduces the bpf_cgroup_storage_set() helper, which will be used to pass a pointer to a cgroup storage to the bpf helper. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 15 +++++++++++++++ kernel/bpf/local_storage.c | 2 ++ 2 files changed, 17 insertions(+) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 7d00d58869ed..9a144ddbbc8f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -21,6 +22,8 @@ struct bpf_cgroup_storage; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + struct bpf_cgroup_storage_map; struct bpf_storage_buffer { @@ -97,6 +100,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) +{ + struct bpf_storage_buffer *buf; + + if (!storage) + return; + + buf = READ_ONCE(storage->buf); + this_cpu_write(bpf_cgroup_storage, &buf->data[0]); +} + struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, @@ -250,6 +264,7 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map) { return 0; } static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index f23d3fdeba23..fc4e37f68f2a 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,6 +7,8 @@ #include #include +DEFINE_PER_CPU(void*, bpf_cgroup_storage); + #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ From d7bf2c10af053191e931b58704cd862fccb7f9de Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:20 -0700 Subject: [PATCH 32/45] bpf: allocate cgroup storage entries on attaching bpf programs If a bpf program is using cgroup local storage, allocate a bpf_cgroup_storage structure automatically on attaching the program to a cgroup and save the pointer into the corresponding bpf_prog_list entry. Analogically, release the cgroup local storage on detaching of the bpf program. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + kernel/bpf/cgroup.c | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 9a144ddbbc8f..f91b0f8ff3a9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -43,6 +43,7 @@ struct bpf_cgroup_storage { struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; + struct bpf_cgroup_storage *storage; }; struct bpf_prog_array; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index badabb0b435c..935274c86bfe 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -34,6 +34,8 @@ void cgroup_bpf_put(struct cgroup *cgrp) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); bpf_prog_put(pl->prog); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -188,6 +190,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; + struct bpf_cgroup_storage *storage, *old_storage = NULL; struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; @@ -210,31 +213,47 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + storage = bpf_cgroup_storage_alloc(prog); + if (IS_ERR(storage)) + return -ENOMEM; + if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) - if (pl->prog == prog) + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) { /* disallow attaching the same prog twice */ + bpf_cgroup_storage_free(storage); return -EINVAL; + } + } pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } + pl_was_allocated = true; pl->prog = prog; + pl->storage = storage; list_add_tail(&pl->node, progs); } else { if (list_empty(progs)) { pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } pl_was_allocated = true; list_add_tail(&pl->node, progs); } else { pl = list_first_entry(progs, typeof(*pl), node); old_prog = pl->prog; + old_storage = pl->storage; + bpf_cgroup_storage_unlink(old_storage); pl_was_allocated = false; } pl->prog = prog; + pl->storage = storage; } cgrp->bpf.flags[type] = flags; @@ -257,10 +276,13 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } static_branch_inc(&cgroup_bpf_enabled_key); + if (old_storage) + bpf_cgroup_storage_free(old_storage); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_cgroup_storage_link(storage, cgrp, type); return 0; cleanup: @@ -276,6 +298,9 @@ cleanup: /* and cleanup the prog list */ pl->prog = old_prog; + bpf_cgroup_storage_free(pl->storage); + pl->storage = old_storage; + bpf_cgroup_storage_link(old_storage, cgrp, type); if (pl_was_allocated) { list_del(&pl->node); kfree(pl); @@ -356,6 +381,8 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ From 394e40a29788820c9c0526b1c3497c9e0ec2a126 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:21 -0700 Subject: [PATCH 33/45] bpf: extend bpf_prog_array to store pointers to the cgroup storage This patch converts bpf_prog_array from an array of prog pointers to the array of struct bpf_prog_array_item elements. This allows to save a cgroup storage pointer for each bpf program efficiently attached to a cgroup. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- drivers/media/rc/bpf-lirc.c | 10 +++-- include/linux/bpf.h | 19 +++++++--- kernel/bpf/cgroup.c | 21 +++++----- kernel/bpf/core.c | 76 +++++++++++++++++++------------------ 4 files changed, 70 insertions(+), 56 deletions(-) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index fcfab6635f9c..8c26df9b96c1 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -195,14 +195,16 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) */ void lirc_bpf_free(struct rc_dev *rcdev) { - struct bpf_prog **progs; + struct bpf_prog_array_item *item; if (!rcdev->raw->progs) return; - progs = rcu_dereference(rcdev->raw->progs)->progs; - while (*progs) - bpf_prog_put(*progs++); + item = rcu_dereference(rcdev->raw->progs)->items; + while (item->prog) { + bpf_prog_put(item->prog); + item++; + } bpf_prog_array_free(rcdev->raw->progs); } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9d1e4727495e..16be67888c30 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -349,9 +349,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, * The 'struct bpf_prog_array *' should only be replaced with xchg() * since other cpus are walking the array of pointers in parallel. */ +struct bpf_prog_array_item { + struct bpf_prog *prog; + struct bpf_cgroup_storage *cgroup_storage; +}; + struct bpf_prog_array { struct rcu_head rcu; - struct bpf_prog *progs[0]; + struct bpf_prog_array_item items[0]; }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); @@ -372,7 +377,8 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ - struct bpf_prog **_prog, *__prog; \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ u32 _ret = 1; \ preempt_disable(); \ @@ -380,10 +386,11 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, _array = rcu_dereference(array); \ if (unlikely(check_non_null && !_array))\ goto _out; \ - _prog = _array->progs; \ - while ((__prog = READ_ONCE(*_prog))) { \ - _ret &= func(__prog, ctx); \ - _prog++; \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + _ret &= func(_prog, ctx); \ + _item++; \ } \ _out: \ rcu_read_unlock(); \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 935274c86bfe..ddfa6cc13e57 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -117,15 +117,18 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - list_for_each_entry(pl, - &p->bpf.progs[type], node) { - if (!pl->prog) - continue; - progs->progs[cnt++] = pl->prog; - } - p = cgroup_parent(p); - } while (p); + if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + continue; + + list_for_each_entry(pl, &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + + progs->items[cnt].prog = pl->prog; + progs->items[cnt].cgroup_storage = pl->storage; + cnt++; + } + } while ((p = cgroup_parent(p))); rcu_assign_pointer(*array, progs); return 0; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 253aa8e79c7b..9abcf25ebf9f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1542,7 +1542,8 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog *) * (prog_cnt + 1), + sizeof(struct bpf_prog_array_item) * + (prog_cnt + 1), flags); return &empty_prog_array.hdr; @@ -1556,43 +1557,45 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +int bpf_prog_array_length(struct bpf_prog_array __rcu *array) { - struct bpf_prog **prog; + struct bpf_prog_array_item *item; u32 cnt = 0; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) - if (*prog != &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) cnt++; rcu_read_unlock(); return cnt; } -static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + +static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt) { + struct bpf_prog_array_item *item; int i = 0; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) { + if (item->prog == &dummy_bpf_prog.prog) continue; - prog_ids[i] = (*prog)->aux->id; + prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { - prog++; + item++; break; } } - return !!(*prog); + return !!(item->prog); } -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, __u32 __user *prog_ids, u32 cnt) { - struct bpf_prog **prog; unsigned long err = 0; bool nospc; u32 *ids; @@ -1611,8 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, if (!ids) return -ENOMEM; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - nospc = bpf_prog_array_copy_core(prog, ids, cnt); + nospc = bpf_prog_array_copy_core(array, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1623,14 +1625,14 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, struct bpf_prog *old_prog) { - struct bpf_prog **prog = progs->progs; + struct bpf_prog_array_item *item = array->items; - for (; *prog; prog++) - if (*prog == old_prog) { - WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + for (; item->prog; item++) + if (item->prog == old_prog) { + WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } @@ -1641,7 +1643,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog **existing_prog; + struct bpf_prog_array_item *existing; struct bpf_prog_array *array; bool found_exclude = false; int new_prog_idx = 0; @@ -1650,15 +1652,15 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, * the new array. */ if (old_array) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) { - if (*existing_prog == exclude_prog) { + existing = old_array->items; + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog) { found_exclude = true; continue; } - if (*existing_prog != &dummy_bpf_prog.prog) + if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; - if (*existing_prog == include_prog) + if (existing->prog == include_prog) return -EEXIST; } } @@ -1684,15 +1686,17 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, /* Fill in the new prog array */ if (carry_prog_cnt) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) - if (*existing_prog != exclude_prog && - *existing_prog != &dummy_bpf_prog.prog) - array->progs[new_prog_idx++] = *existing_prog; + existing = old_array->items; + for (; existing->prog; existing++) + if (existing->prog != exclude_prog && + existing->prog != &dummy_bpf_prog.prog) { + array->items[new_prog_idx++].prog = + existing->prog; + } } if (include_prog) - array->progs[new_prog_idx++] = include_prog; - array->progs[new_prog_idx] = NULL; + array->items[new_prog_idx++].prog = include_prog; + array->items[new_prog_idx].prog = NULL; *new_array = array; return 0; } @@ -1701,7 +1705,6 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { - struct bpf_prog **prog; u32 cnt = 0; if (array) @@ -1714,8 +1717,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ - prog = rcu_dereference_check(array, 1)->progs; - return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC + return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC : 0; } From 3e6a4b3e0289dc9540a2c1d8a20657f4707fbabb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:22 -0700 Subject: [PATCH 34/45] bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE BPF_MAP_TYPE_CGROUP_STORAGE maps are special in a way that the access from the bpf program side is lookup-free. That means the result is guaranteed to be a valid pointer to the cgroup storage; no NULL-check is required. This patch introduces BPF_PTR_TO_MAP_VALUE return type, which is required to cause the verifier accept programs, which are not checking the map value pointer for being NULL. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 16be67888c30..ca4ac2a39def 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_arg_type { enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ + RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7e75434a9e54..1ede16c8bb40 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2545,8 +2545,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || + fn->ret_type == RET_PTR_TO_MAP_VALUE) { + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + else + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; From 7b5dd2bde72cd33313b63cf3ba1de6a9e443a65d Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:23 -0700 Subject: [PATCH 35/45] bpf: don't allow create maps of cgroup local storages As there is one-to-one relation between a bpf program and cgroup local storage map, there is no sense in creating a map of cgroup local storage maps. Forbid it explicitly to avoid possible side effects. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/map_in_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 1da574612bea..3bfbf4464416 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -23,7 +23,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) * is a runtime binding. Doing static check alone * in the verifier is not enough. */ - if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) { fdput(f); return ERR_PTR(-ENOTSUPP); } From cd3394317653837e2eb5c5d0904a8996102af9fc Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:24 -0700 Subject: [PATCH 36/45] bpf: introduce the bpf_get_local_storage() helper function The bpf_get_local_storage() helper function is used to get a pointer to the bpf local storage from a bpf program. It takes a pointer to a storage map and flags as arguments. Right now it accepts only cgroup storage maps, and flags argument has to be 0. Further it can be extended to support other types of local storage: e.g. thread local storage etc. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- kernel/bpf/cgroup.c | 2 ++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 20 ++++++++++++++++++++ kernel/bpf/verifier.c | 18 ++++++++++++++++++ net/core/filter.c | 23 ++++++++++++++++++++++- 7 files changed, 85 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ca4ac2a39def..cd8790d2c6ed 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -788,6 +788,8 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_local_storage_proto; + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b10118ee5afe..dd5758dc35d3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2095,6 +2095,24 @@ union bpf_attr { * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. + * + * void* get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the bpf program type, a local storage area + * can be shared between multiple instances of the bpf program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the BPF_STX_XADD instruction to alter + * the shared data. + * Return + * Pointer to the local storage area. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2177,7 +2195,8 @@ union bpf_attr { FN(rc_repeat), \ FN(rc_keydown), \ FN(skb_cgroup_id), \ - FN(get_current_cgroup_id), + FN(get_current_cgroup_id), \ + FN(get_local_storage), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddfa6cc13e57..0a4fe5a7dc91 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -684,6 +684,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9abcf25ebf9f..4d09e610777f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1795,6 +1795,7 @@ const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 73065e2d23c2..1991466b8327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -193,4 +193,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* map and flags arguments are not used now, + * but provide an ability to extend the API + * for other types of local storages. + * verifier checks that their values are correct. + */ + return (unsigned long) this_cpu_read(bpf_cgroup_storage); +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ede16c8bb40..587468a9c37d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2127,6 +2127,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + if (func_id != BPF_FUNC_get_local_storage) + goto error; + break; /* devmap returns a pointer to a live net_device ifindex that we cannot * allow to be modified from bpf side. So do not allow lookup elements * for now. @@ -2209,6 +2213,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; + case BPF_FUNC_get_local_storage: + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + goto error; + break; default: break; } @@ -2533,6 +2541,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs = cur_regs(env); + + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. + */ + if (func_id == BPF_FUNC_get_local_storage && + !register_is_null(®s[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/net/core/filter.c b/net/core/filter.c index 9bb9a4488e25..9f73aae2f089 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4820,6 +4820,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -4844,6 +4846,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -4866,6 +4870,17 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; + default: + return sk_filter_func_proto(func_id, prog); + } +} + static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4988,6 +5003,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5007,6 +5024,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5034,6 +5053,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -6838,7 +6859,7 @@ const struct bpf_prog_ops xdp_prog_ops = { }; const struct bpf_verifier_ops cg_skb_verifier_ops = { - .get_func_proto = sk_filter_func_proto, + .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; From c419cf52da77d0f0f01d4b75a75ae89406f0923f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:25 -0700 Subject: [PATCH 37/45] bpf: sync bpf.h to tools/ Sync cgroup storage related changes: 1) new BPF_MAP_TYPE_CGROUP_STORAGE map type 2) struct bpf_cgroup_sotrage_key definition 3) get_local_storage() helper Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0ebaaf7f3568..dd5758dc35d3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -75,6 +75,11 @@ struct bpf_lpm_trie_key { __u8 data[0]; /* Arbitrary size */ }; +struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; /* cgroup inode id */ + __u32 attach_type; /* program attach type */ +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -120,6 +125,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, + BPF_MAP_TYPE_CGROUP_STORAGE, }; enum bpf_prog_type { @@ -2089,6 +2095,24 @@ union bpf_attr { * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. + * + * void* get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the bpf program type, a local storage area + * can be shared between multiple instances of the bpf program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the BPF_STX_XADD instruction to alter + * the shared data. + * Return + * Pointer to the local storage area. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2171,7 +2195,8 @@ union bpf_attr { FN(rc_repeat), \ FN(rc_keydown), \ FN(skb_cgroup_id), \ - FN(get_current_cgroup_id), + FN(get_current_cgroup_id), \ + FN(get_local_storage), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call From 34a6bbb8131b75daeb3e9be026d39d6463762baf Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:26 -0700 Subject: [PATCH 38/45] bpftool: add support for CGROUP_STORAGE maps Add BPF_MAP_TYPE_CGROUP_STORAGE maps to the list of maps types which bpftool recognizes. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Jakub Kicinski Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 0ee3ba479d87..2dd1f8d9cc2d 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -68,6 +68,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_SOCKMAP] = "sockmap", [BPF_MAP_TYPE_CPUMAP] = "cpumap", [BPF_MAP_TYPE_SOCKHASH] = "sockhash", + [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", }; static bool map_is_per_cpu(__u32 type) From f42ee093be2980f2689ea7a170d580364820f48b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:27 -0700 Subject: [PATCH 39/45] bpf/test_run: support cgroup local storage Allocate a temporary cgroup storage to use for bpf program test runs. Because the test program is not actually attached to a cgroup, the storage is allocated manually just for the execution of the bpf program. If the program is executed multiple times, the storage is not zeroed on each run, emulating multiple runs of the program, attached to a real cgroup. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 22a78eedf4b1..f4078830ea50 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -11,12 +11,14 @@ #include #include -static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) +static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, + struct bpf_cgroup_storage *storage) { u32 ret; preempt_disable(); rcu_read_lock(); + bpf_cgroup_storage_set(storage); ret = BPF_PROG_RUN(prog, ctx); rcu_read_unlock(); preempt_enable(); @@ -26,14 +28,19 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) { + struct bpf_cgroup_storage *storage = NULL; u64 time_start, time_spent = 0; u32 ret = 0, i; + storage = bpf_cgroup_storage_alloc(prog); + if (IS_ERR(storage)) + return PTR_ERR(storage); + if (!repeat) repeat = 1; time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - ret = bpf_test_run_one(prog, ctx); + ret = bpf_test_run_one(prog, ctx, storage); if (need_resched()) { if (signal_pending(current)) break; @@ -46,6 +53,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) do_div(time_spent, repeat); *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; + bpf_cgroup_storage_free(storage); + return ret; } From d4c9f573537506530019c505c8b6097a7d7a5fab Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:28 -0700 Subject: [PATCH 40/45] selftests/bpf: add verifier cgroup storage tests Add the following verifier tests to cover the cgroup storage functionality: 1) valid access to the cgroup storage 2) invalid access: use regular hashmap instead of cgroup storage map 3) invalid access: use invalid map fd 4) invalid access: try access memory after the cgroup storage 5) invalid access: try access memory before the cgroup storage 6) invalid access: call get_local_storage() with non-zero flags For tests 2)-6) check returned error strings. Expected output: $ ./test_verifier #0/u add+sub+mul OK #0/p add+sub+mul OK #1/u DIV32 by 0, zero check 1 OK ... #280/p valid cgroup storage access OK #281/p invalid cgroup storage access 1 OK #282/p invalid cgroup storage access 2 OK #283/p invalid per-cgroup storage access 3 OK #284/p invalid cgroup storage access 4 OK #285/p invalid cgroup storage access 5 OK ... #649/p pass modified ctx pointer to helper, 2 OK #650/p pass modified ctx pointer to helper, 3 OK Summary: 901 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/bpf_helpers.h | 2 + tools/testing/selftests/bpf/test_verifier.c | 140 +++++++++++++++++++- 2 files changed, 141 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 19a424483f6e..cb9fcfbc9307 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -135,6 +135,8 @@ static int (*bpf_rc_keydown)(void *ctx, unsigned int protocol, (void *) BPF_FUNC_rc_keydown; static unsigned long long (*bpf_get_current_cgroup_id)(void) = (void *) BPF_FUNC_get_current_cgroup_id; +static void *(*bpf_get_local_storage)(void *map, unsigned long long flags) = + (void *) BPF_FUNC_get_local_storage; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index c582afba9d1f..4b5e03c25204 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -50,7 +50,7 @@ #define MAX_INSNS BPF_MAXINSNS #define MAX_FIXUPS 8 -#define MAX_NR_MAPS 7 +#define MAX_NR_MAPS 8 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 @@ -70,6 +70,7 @@ struct bpf_test { int fixup_prog1[MAX_FIXUPS]; int fixup_prog2[MAX_FIXUPS]; int fixup_map_in_map[MAX_FIXUPS]; + int fixup_cgroup_storage[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; uint32_t retval; @@ -4630,6 +4631,121 @@ static struct bpf_test tests[] = { .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, + { + "valid cgroup storage access", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_local_storage), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_cgroup_storage = { 1 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "invalid cgroup storage access 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_local_storage), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 1 }, + .result = REJECT, + .errstr = "cannot pass map_type 1 into func bpf_get_local_storage", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "invalid cgroup storage access 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_LD_MAP_FD(BPF_REG_1, 1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_local_storage), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "fd 1 is not pointing to valid bpf_map", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "invalid per-cgroup storage access 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_local_storage), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 256), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_cgroup_storage = { 1 }, + .result = REJECT, + .errstr = "invalid access to map value, value_size=64 off=256 size=4", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "invalid cgroup storage access 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_local_storage), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, -2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + BPF_EXIT_INSN(), + }, + .fixup_cgroup_storage = { 1 }, + .result = REJECT, + .errstr = "invalid access to map value, value_size=64 off=-2 size=4", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "invalid cgroup storage access 5", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 7), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_local_storage), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_cgroup_storage = { 1 }, + .result = REJECT, + .errstr = "get_local_storage() doesn't support non-zero flags", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "invalid cgroup storage access 6", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_local_storage), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_cgroup_storage = { 1 }, + .result = REJECT, + .errstr = "get_local_storage() doesn't support non-zero flags", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, { "multiple registers share map_lookup_elem result", .insns = { @@ -12462,6 +12578,19 @@ static int create_map_in_map(void) return outer_map_fd; } +static int create_cgroup_storage(void) +{ + int fd; + + fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, + sizeof(struct bpf_cgroup_storage_key), + TEST_DATA_LEN, 0, 0); + if (fd < 0) + printf("Failed to create array '%s'!\n", strerror(errno)); + + return fd; +} + static char bpf_vlog[UINT_MAX >> 8]; static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, @@ -12474,6 +12603,7 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, int *fixup_prog1 = test->fixup_prog1; int *fixup_prog2 = test->fixup_prog2; int *fixup_map_in_map = test->fixup_map_in_map; + int *fixup_cgroup_storage = test->fixup_cgroup_storage; if (test->fill_helper) test->fill_helper(test); @@ -12541,6 +12671,14 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, fixup_map_in_map++; } while (*fixup_map_in_map); } + + if (*fixup_cgroup_storage) { + map_fds[7] = create_cgroup_storage(); + do { + prog[*fixup_cgroup_storage].imm = map_fds[7]; + fixup_cgroup_storage++; + } while (*fixup_cgroup_storage); + } } static void do_test_single(struct bpf_test *test, bool unpriv, From 68cfa3ac6b8db2782300ad0d699da27aaa2ac9fb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:29 -0700 Subject: [PATCH 41/45] selftests/bpf: add a cgroup storage test Implement a test to cover the cgroup storage functionality. The test implements a bpf program which drops every second packet by using the cgroup storage as a persistent storage. The test also use the userspace API to check the data in the cgroup storage, alter it, and check that the loaded and attached bpf program sees the update. Expected output: $ ./test_cgroup_storage test_cgroup_storage:PASS Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/test_cgroup_storage.c | 130 ++++++++++++++++++ 2 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/test_cgroup_storage.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1b28277998e2..ad241ddba350 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -23,7 +23,7 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ - test_socket_cookie + test_socket_cookie test_cgroup_storage TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ @@ -66,6 +66,7 @@ $(OUTPUT)/test_sockmap: cgroup_helpers.c $(OUTPUT)/test_tcpbpf_user: cgroup_helpers.c $(OUTPUT)/test_progs: trace_helpers.c $(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c +$(OUTPUT)/test_cgroup_storage: cgroup_helpers.c .PHONY: force diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c new file mode 100644 index 000000000000..dc83fb2d3f27 --- /dev/null +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include "cgroup_helpers.h" + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +#define TEST_CGROUP "/test-bpf-cgroup-storage-buf/" + +int main(int argc, char **argv) +{ + struct bpf_insn prog[] = { + BPF_LD_MAP_FD(BPF_REG_1, 0), /* map fd */ + BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_local_storage), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x1), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + int error = EXIT_FAILURE; + int map_fd, prog_fd, cgroup_fd; + struct bpf_cgroup_storage_key key; + unsigned long long value; + + map_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, sizeof(key), + sizeof(value), 0, 0); + if (map_fd < 0) { + printf("Failed to create map: %s\n", strerror(errno)); + goto out; + } + + prog[0].imm = map_fd; + prog_fd = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); + if (prog_fd < 0) { + printf("Failed to load bpf program: %s\n", bpf_log_buf); + goto out; + } + + if (setup_cgroup_environment()) { + printf("Failed to setup cgroup environment\n"); + goto err; + } + + /* Create a cgroup, get fd, and join it */ + cgroup_fd = create_and_get_cgroup(TEST_CGROUP); + if (!cgroup_fd) { + printf("Failed to create test cgroup\n"); + goto err; + } + + if (join_cgroup(TEST_CGROUP)) { + printf("Failed to join cgroup\n"); + goto err; + } + + /* Attach the bpf program */ + if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) { + printf("Failed to attach bpf program\n"); + goto err; + } + + if (bpf_map_get_next_key(map_fd, NULL, &key)) { + printf("Failed to get the first key in cgroup storage\n"); + goto err; + } + + if (bpf_map_lookup_elem(map_fd, &key, &value)) { + printf("Failed to lookup cgroup storage\n"); + goto err; + } + + /* Every second packet should be dropped */ + assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); + assert(system("ping localhost -c 1 -W 1 -q > /dev/null")); + assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); + + /* Check the counter in the cgroup local storage */ + if (bpf_map_lookup_elem(map_fd, &key, &value)) { + printf("Failed to lookup cgroup storage\n"); + goto err; + } + + if (value != 3) { + printf("Unexpected data in the cgroup storage: %llu\n", value); + goto err; + } + + /* Bump the counter in the cgroup local storage */ + value++; + if (bpf_map_update_elem(map_fd, &key, &value, 0)) { + printf("Failed to update the data in the cgroup storage\n"); + goto err; + } + + /* Every second packet should be dropped */ + assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); + assert(system("ping localhost -c 1 -W 1 -q > /dev/null")); + assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); + + /* Check the final value of the counter in the cgroup local storage */ + if (bpf_map_lookup_elem(map_fd, &key, &value)) { + printf("Failed to lookup the cgroup storage\n"); + goto err; + } + + if (value != 7) { + printf("Unexpected data in the cgroup storage: %llu\n", value); + goto err; + } + + error = 0; + printf("test_cgroup_storage:PASS\n"); + +err: + cleanup_cgroup_environment(); + +out: + return error; +} From 28ba068760a7e136a7fe2783bca74e3f43affb9b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:30 -0700 Subject: [PATCH 42/45] samples/bpf: extend test_cgrp2_attach2 test to use cgroup storage The test_cgrp2_attach test covers bpf cgroup attachment code well, so let's re-use it for testing allocation/releasing of cgroup storage. The extension is pretty straightforward: the bpf program will use the cgroup storage to save the number of transmitted bytes. Expected output: $ ./test_cgrp2_attach2 Attached DROP prog. This ping in cgroup /foo should fail... ping: sendmsg: Operation not permitted Attached DROP prog. This ping in cgroup /foo/bar should fail... ping: sendmsg: Operation not permitted Attached PASS prog. This ping in cgroup /foo/bar should pass... Detached PASS from /foo/bar while DROP is attached to /foo. This ping in cgroup /foo/bar should fail... ping: sendmsg: Operation not permitted Attached PASS from /foo/bar and detached DROP from /foo. This ping in cgroup /foo/bar should pass... ### override:PASS ### multi:PASS Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- samples/bpf/test_cgrp2_attach2.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index b453e6a161be..180f9d813bca 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -8,7 +8,8 @@ * information. The number of invocations of the program, which maps * to the number of packets received, is stored to key 0. Key 1 is * incremented on each iteration by the number of bytes stored in - * the skb. + * the skb. The program also stores the number of received bytes + * in the cgroup storage. * * - Attaches the new program to a cgroup using BPF_PROG_ATTACH * @@ -21,12 +22,15 @@ #include #include #include +#include +#include #include #include #include #include "bpf_insn.h" +#include "bpf_rlimit.h" #include "cgroup_helpers.h" #define FOO "/foo" @@ -205,6 +209,8 @@ static int map_fd = -1; static int prog_load_cnt(int verdict, int val) { + int cgroup_storage_fd; + if (map_fd < 0) map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); if (map_fd < 0) { @@ -212,6 +218,13 @@ static int prog_load_cnt(int verdict, int val) return -1; } + cgroup_storage_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, + sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); + if (cgroup_storage_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + struct bpf_insn prog[] = { BPF_MOV32_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ @@ -222,6 +235,11 @@ static int prog_load_cnt(int verdict, int val) BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), + BPF_MOV64_IMM(BPF_REG_1, val), + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0), BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ BPF_EXIT_INSN(), }; @@ -237,6 +255,7 @@ static int prog_load_cnt(int verdict, int val) printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); return 0; } + close(cgroup_storage_fd); return ret; } From 0069fb854364da79fd99236ea620affc8e1152d5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 15:47:10 -0700 Subject: [PATCH 43/45] selftests/bpf: fix a typo in map in map test Commit fbeb1603bf4e ("bpf: verifier: MOV64 don't mark dst reg unbounded") revealed a typo in commit fb30d4b71214 ("bpf: Add tests for map-in-map"): BPF_MOV64_REG(BPF_REG_0, 0) was used instead of BPF_MOV64_IMM(BPF_REG_0, 0). I've noticed the problem by running bpf kselftests. Fixes: fb30d4b71214 ("bpf: Add tests for map-in-map") Signed-off-by: Roman Gushchin Cc: Martin KaFai Lau Cc: Arthur Fabre Cc: Daniel Borkmann Cc: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 4b5e03c25204..ac281ee771dd 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -7113,7 +7113,7 @@ static struct bpf_test tests[] = { BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_MOV64_REG(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_in_map = { 3 }, @@ -7136,7 +7136,7 @@ static struct bpf_test tests[] = { BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_MOV64_REG(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_in_map = { 3 }, @@ -7158,7 +7158,7 @@ static struct bpf_test tests[] = { BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_MOV64_REG(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_in_map = { 3 }, From 0c26159352ba1cdc5a8c8d74131cc19cdfdf9371 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Aug 2018 22:06:00 -0700 Subject: [PATCH 44/45] nfp: bpf: xdp_adjust_tail support Add support for adjust_tail. There are no FW changes needed but add a FW capability just in case there would be any issue with previously released FW, or we will have to change the ABI in the future. The helper is trivial and shouldn't be used too often so just inline the body of the function. We add the delta to locally maintained packet length register and check for overflow, since add of negative value must overflow if result is positive. Note that if delta of 0 would be allowed in the kernel this trick stops working and we need one more instruction to compare lengths before and after the change. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/fw.h | 1 + drivers/net/ethernet/netronome/nfp/bpf/jit.c | 47 +++++++++++++++++++ drivers/net/ethernet/netronome/nfp/bpf/main.c | 13 +++++ drivers/net/ethernet/netronome/nfp/bpf/main.h | 2 + .../net/ethernet/netronome/nfp/bpf/verifier.c | 7 +++ drivers/net/ethernet/netronome/nfp/nfp_asm.h | 1 + 6 files changed, 71 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h index 4c7972e3db63..e4f9b7ec8528 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h @@ -51,6 +51,7 @@ enum bpf_cap_tlv_type { NFP_BPF_CAP_TYPE_MAPS = 3, NFP_BPF_CAP_TYPE_RANDOM = 4, NFP_BPF_CAP_TYPE_QUEUE_SELECT = 5, + NFP_BPF_CAP_TYPE_ADJUST_TAIL = 6, }; struct nfp_bpf_cap_tlv_func { diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 3c22d27de9da..eff57f7d056a 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -1642,6 +1642,51 @@ static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) return 0; } +static int adjust_tail(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + u32 ret_einval, end; + swreg plen, delta; + + BUILD_BUG_ON(plen_reg(nfp_prog) != reg_b(STATIC_REG_PKT_LEN)); + + plen = imm_a(nfp_prog); + delta = reg_a(2 * 2); + + ret_einval = nfp_prog_current_offset(nfp_prog) + 9; + end = nfp_prog_current_offset(nfp_prog) + 11; + + /* Calculate resulting length */ + emit_alu(nfp_prog, plen, plen_reg(nfp_prog), ALU_OP_ADD, delta); + /* delta == 0 is not allowed by the kernel, add must overflow to make + * length smaller. + */ + emit_br(nfp_prog, BR_BCC, ret_einval, 0); + + /* if (new_len < 14) then -EINVAL */ + emit_alu(nfp_prog, reg_none(), plen, ALU_OP_SUB, reg_imm(ETH_HLEN)); + emit_br(nfp_prog, BR_BMI, ret_einval, 0); + + emit_alu(nfp_prog, plen_reg(nfp_prog), + plen_reg(nfp_prog), ALU_OP_ADD, delta); + emit_alu(nfp_prog, pv_len(nfp_prog), + pv_len(nfp_prog), ALU_OP_ADD, delta); + + emit_br(nfp_prog, BR_UNC, end, 2); + wrp_immed(nfp_prog, reg_both(0), 0); + wrp_immed(nfp_prog, reg_both(1), 0); + + if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval)) + return -EINVAL; + + wrp_immed(nfp_prog, reg_both(0), -22); + wrp_immed(nfp_prog, reg_both(1), ~0); + + if (!nfp_prog_confirm_current_offset(nfp_prog, end)) + return -EINVAL; + + return 0; +} + static int map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { @@ -3041,6 +3086,8 @@ static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) switch (meta->insn.imm) { case BPF_FUNC_xdp_adjust_head: return adjust_head(nfp_prog, meta); + case BPF_FUNC_xdp_adjust_tail: + return adjust_tail(nfp_prog, meta); case BPF_FUNC_map_lookup_elem: case BPF_FUNC_map_update_elem: case BPF_FUNC_map_delete_elem: diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index cce1d2945a32..970af07f4656 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -334,6 +334,14 @@ nfp_bpf_parse_cap_qsel(struct nfp_app_bpf *bpf, void __iomem *value, u32 length) return 0; } +static int +nfp_bpf_parse_cap_adjust_tail(struct nfp_app_bpf *bpf, void __iomem *value, + u32 length) +{ + bpf->adjust_tail = true; + return 0; +} + static int nfp_bpf_parse_capabilities(struct nfp_app *app) { struct nfp_cpp *cpp = app->pf->cpp; @@ -380,6 +388,11 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app) if (nfp_bpf_parse_cap_qsel(app->priv, value, length)) goto err_release_free; break; + case NFP_BPF_CAP_TYPE_ADJUST_TAIL: + if (nfp_bpf_parse_cap_adjust_tail(app->priv, value, + length)) + goto err_release_free; + break; default: nfp_dbg(cpp, "unknown BPF capability: %d\n", type); break; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 57573bfa8c03..dbd00982fd2b 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -150,6 +150,7 @@ enum pkt_vec { * * @pseudo_random: FW initialized the pseudo-random machinery (CSRs) * @queue_select: BPF can set the RX queue ID in packet vector + * @adjust_tail: BPF can simply trunc packet size for adjust tail */ struct nfp_app_bpf { struct nfp_app *app; @@ -195,6 +196,7 @@ struct nfp_app_bpf { bool pseudo_random; bool queue_select; + bool adjust_tail; }; enum nfp_bpf_map_use { diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 49ba0d645d36..a6e9248669e1 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -178,6 +178,13 @@ nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env, nfp_record_adjust_head(bpf, nfp_prog, meta, reg2); break; + case BPF_FUNC_xdp_adjust_tail: + if (!bpf->adjust_tail) { + pr_vlog(env, "adjust_tail not supported by FW\n"); + return -EOPNOTSUPP; + } + break; + case BPF_FUNC_map_lookup_elem: if (!nfp_bpf_map_call_ok("map_lookup", env, meta, bpf->helpers.map_lookup, reg1) || diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h index cdc4e065f6f5..fad0e62a910c 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h @@ -93,6 +93,7 @@ enum br_mask { BR_BNE = 0x01, BR_BMI = 0x02, BR_BHS = 0x04, + BR_BCC = 0x05, BR_BLO = 0x05, BR_BGE = 0x08, BR_BLT = 0x09, From 85fc4b16aaf05fc8978d242c556a1711dce15cf8 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 6 Aug 2018 14:27:28 -0700 Subject: [PATCH 45/45] bpf: introduce update_effective_progs() __cgroup_bpf_attach() and __cgroup_bpf_detach() functions have a good amount of duplicated code, which is possible to eliminate by introducing the update_effective_progs() helper function. The update_effective_progs() calls compute_effective_progs() and then in case of success it calls activate_effective_progs() for each descendant cgroup. In case of failure (OOM), it releases allocated prog arrays and return the error code. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 99 +++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 54 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0a4fe5a7dc91..6a7d931bbc55 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -177,6 +177,45 @@ cleanup: return -ENOMEM; } +static int update_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type) +{ + struct cgroup_subsys_state *css; + int err; + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + return err; +} + #define BPF_CGROUP_MAX_PROGS 64 /** @@ -194,7 +233,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage, *old_storage = NULL; - struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; int err; @@ -261,22 +299,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = flags; - /* allocate and recompute effective prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - err = compute_effective_progs(desc, type, &desc->bpf.inactive); - if (err) - goto cleanup; - } - - /* all allocations were successful. Activate all prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - activate_effective_progs(desc, type, desc->bpf.inactive); - desc->bpf.inactive = NULL; - } + err = update_effective_progs(cgrp, type); + if (err) + goto cleanup; static_branch_inc(&cgroup_bpf_enabled_key); if (old_storage) @@ -289,16 +314,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, return 0; cleanup: - /* oom while computing effective. Free all computed effective arrays - * since they were not activated - */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - bpf_prog_array_free(desc->bpf.inactive); - desc->bpf.inactive = NULL; - } - /* and cleanup the prog list */ pl->prog = old_prog; bpf_cgroup_storage_free(pl->storage); @@ -326,7 +341,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog *old_prog = NULL; - struct cgroup_subsys_state *css; struct bpf_prog_list *pl; int err; @@ -365,22 +379,9 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; } - /* allocate and recompute effective prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - err = compute_effective_progs(desc, type, &desc->bpf.inactive); - if (err) - goto cleanup; - } - - /* all allocations were successful. Activate all prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - activate_effective_progs(desc, type, desc->bpf.inactive); - desc->bpf.inactive = NULL; - } + err = update_effective_progs(cgrp, type); + if (err) + goto cleanup; /* now can actually delete it from this cgroup list */ list_del(&pl->node); @@ -396,16 +397,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, return 0; cleanup: - /* oom while computing effective. Free all computed effective arrays - * since they were not activated - */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - bpf_prog_array_free(desc->bpf.inactive); - desc->bpf.inactive = NULL; - } - /* and restore back old_prog */ pl->prog = old_prog; return err;