From 03ad047b700c2dcb7128888d946a092e2353b497 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 29 Jul 2024 17:03:01 +0200
Subject: [PATCH 1/5] xdp-forward: Introduce xdp-fwd-flowtable bpf sample

Introduce xdp-fwd-flowtable sample in order to perform XDP_REDIRECT
between net_devices inserted in a netfilter flowtable.
xdp-fwd-flowtable relies on bpf_xdp_flow_lookup kfunc in order to
perform the lookup of a given flowtable entry based on a fib tuple of
incoming traffic. At the moment we are able to offload just TCP or UDP
netfilter flowtable entries to the xdp layer. The user is supposed to
configure the flowtable separately.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 headers/linux/hlist.h           |   4 +
 headers/linux/netfilter.h       | 114 ++++++
 xdp-forward/Makefile            |   2 +-
 xdp-forward/xdp_flowtable.bpf.c | 609 ++++++++++++++++++++++++++++++++
 4 files changed, 728 insertions(+), 1 deletion(-)
 create mode 100644 headers/linux/netfilter.h
 create mode 100644 xdp-forward/xdp_flowtable.bpf.c

diff --git a/headers/linux/hlist.h b/headers/linux/hlist.h
index a451b49c..ae35508b 100644
--- a/headers/linux/hlist.h
+++ b/headers/linux/hlist.h
@@ -5,6 +5,10 @@
 
 struct list_head;
 
+struct rhash_head {
+	struct rhash_head *next;
+};
+
 #define HLIST_POISON_POINTER_DELTA 0
 #define HLIST_POISON1  ((void *) 0x100 + HLIST_POISON_POINTER_DELTA)
 #define HLIST_POISON2  ((void *) 0x200 + HLIST_POISON_POINTER_DELTA)
diff --git a/headers/linux/netfilter.h b/headers/linux/netfilter.h
new file mode 100644
index 00000000..604d945e
--- /dev/null
+++ b/headers/linux/netfilter.h
@@ -0,0 +1,114 @@
+#ifndef _LINUX_NETFILTER_H
+#define _LINUX_NETFILTER_H
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+#include <xdp/parsing_helpers.h>
+
+#include "hlist.h"
+
+struct flow_ports {
+	__be16 source, dest;
+};
+
+enum ip_conntrack_dir {
+	IP_CT_DIR_ORIGINAL,
+	IP_CT_DIR_REPLY,
+	IP_CT_DIR_MAX
+};
+
+enum flow_offload_tuple_dir {
+	FLOW_OFFLOAD_DIR_ORIGINAL	= IP_CT_DIR_ORIGINAL,
+	FLOW_OFFLOAD_DIR_REPLY		= IP_CT_DIR_REPLY,
+	FLOW_OFFLOAD_DIR_MAX		= IP_CT_DIR_MAX,
+};
+
+enum flow_offload_type {
+	NF_FLOW_OFFLOAD_UNSPEC,
+	NF_FLOW_OFFLOAD_ROUTE,
+};
+
+enum nf_flow_flags {
+	NF_FLOW_SNAT,
+	NF_FLOW_DNAT,
+	NF_FLOW_TEARDOWN,
+	NF_FLOW_HW,
+	NF_FLOW_HW_DYING,
+	NF_FLOW_HW_DEAD,
+	NF_FLOW_HW_PENDING,
+	NF_FLOW_HW_BIDIRECTIONAL,
+	NF_FLOW_HW_ESTABLISHED,
+};
+
+enum flow_offload_xmit_type {
+	FLOW_OFFLOAD_XMIT_UNSPEC,
+	FLOW_OFFLOAD_XMIT_NEIGH,
+	FLOW_OFFLOAD_XMIT_XFRM,
+	FLOW_OFFLOAD_XMIT_DIRECT,
+	FLOW_OFFLOAD_XMIT_TC,
+};
+
+#define NF_FLOW_TABLE_ENCAP_MAX		2
+struct flow_offload_tuple {
+	union {
+		struct in_addr		src_v4;
+		struct in6_addr		src_v6;
+	};
+	union {
+		struct in_addr		dst_v4;
+		struct in6_addr		dst_v6;
+	};
+	struct {
+		__be16			src_port;
+		__be16			dst_port;
+	};
+
+	int				iifidx;
+
+	__u8				l3proto;
+	__u8				l4proto;
+	struct {
+		__u16			id;
+		__be16			proto;
+	} encap[NF_FLOW_TABLE_ENCAP_MAX];
+
+	/* All members above are keys for lookups, see flow_offload_hash(). */
+	struct { }			__hash;
+
+	__u8				dir:2,
+					xmit_type:3,
+					encap_num:2,
+					in_vlan_ingress:2;
+	__u16				mtu;
+	union {
+		struct {
+			struct dst_entry *dst_cache;
+			__u32		dst_cookie;
+		};
+		struct {
+			__u32		ifidx;
+			__u32		hw_ifidx;
+			__u8		h_source[ETH_ALEN];
+			__u8		h_dest[ETH_ALEN];
+		} out;
+		struct {
+			__u32		iifidx;
+		} tc;
+	};
+};
+
+struct flow_offload_tuple_rhash {
+	struct rhash_head		node;
+	struct flow_offload_tuple	tuple;
+};
+
+struct flow_offload {
+	struct flow_offload_tuple_rhash		tuplehash[FLOW_OFFLOAD_DIR_MAX];
+	struct nf_conn				*ct;
+	unsigned long				flags;
+	__u16					type;
+	__u32					timeout;
+};
+
+#endif /* _LINUX_NETFILTER_H */
diff --git a/xdp-forward/Makefile b/xdp-forward/Makefile
index 51201772..17e8374e 100644
--- a/xdp-forward/Makefile
+++ b/xdp-forward/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-XDP_TARGETS := xdp_forward.bpf
+XDP_TARGETS := xdp_forward.bpf xdp_flowtable.bpf
 BPF_SKEL_TARGETS := $(XDP_TARGETS)
 
 XDP_OBJ_INSTALL :=
diff --git a/xdp-forward/xdp_flowtable.bpf.c b/xdp-forward/xdp_flowtable.bpf.c
new file mode 100644
index 00000000..32f627d7
--- /dev/null
+++ b/xdp-forward/xdp_flowtable.bpf.c
@@ -0,0 +1,609 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Original xdp_fwd sample Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ */
+
+#include <bpf/vmlinux.h>
+#include <linux/bpf.h>
+#include <linux/netfilter.h>
+#include <bpf/bpf_core_read.h>
+
+#define AF_INET				2
+#define AF_INET6			10
+/* 32-bit big-endian mask: bpf_htons() would truncate it to 16 bits */
+#define IPV6_FLOWINFO_MASK              bpf_htonl(0x0FFFFFFF)
+
+#define IP_MF				0x2000	/* "More Fragments" */
+#define IP_OFFSET			0x1fff	/* "Fragment Offset" */
+#define CSUM_MANGLED_0			((__sum16)0xffff)
+/* unsigned long so the mask has the same type as flow_offload.flags */
+#define BIT(x)				(1UL << (x))
+
+struct {
+	__uint(type, BPF_MAP_TYPE_DEVMAP);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(int));
+	__uint(max_entries, 64);
+} xdp_tx_ports SEC(".maps");
+
+struct bpf_flowtable_opts {
+	__s32 error;
+};
+
+struct flow_offload_tuple_rhash *
+bpf_xdp_flow_lookup(struct xdp_md *, struct bpf_fib_lookup *,
+		    struct bpf_flowtable_opts *, __u32) __ksym;
+
+/* From include/net/ip.h: decrement TTL, fix up iph->check; returns new TTL. */
+static __always_inline int ip_decrease_ttl(struct iphdr *iph)
+{
+	__u32 check = (__u32)iph->check;
+
+	check += (__u32)bpf_htons(0x0100);
+	iph->check = (__sum16)(check + (check >= 0xFFFF));
+	return --iph->ttl;
+}
+
+static __always_inline __u32 csum_add(__u32 csum, __u32 addend)
+{
+	__u32 res = csum + addend;
+
+	return res + (res < addend);
+}
+
+static __always_inline __u16 csum_fold(__u32 csum)
+{
+	csum = (csum & 0xffff) + (csum >> 16);
+	csum = (csum & 0xffff) + (csum >> 16);
+	return ~csum;
+}
+
+static __always_inline __u16 csum_replace4(__u32 csum, __u32 from, __u32 to)
+{
+	__u32 tmp = csum_add(~csum, ~from);
+
+	return csum_fold(csum_add(tmp, to));
+}
+
+static __always_inline __u16 csum_replace16(__u32 csum, __u32 *from, __u32 *to)
+{
+	__u32 diff[] = {
+		~from[0], ~from[1], ~from[2], ~from[3],
+		to[0], to[1], to[2], to[3],
+	};
+
+	csum = bpf_csum_diff(0, 0, diff, sizeof(diff), ~csum);
+	return csum_fold(csum);
+}
+
+static __always_inline int
+xdp_flowtable_check_tcp_state(void *ports, void *data_end, __u8 proto)
+{
+	if (proto == IPPROTO_TCP) {
+		struct tcphdr *tcph = ports;
+
+		if (tcph + 1 > data_end)
+			return -1;
+
+		if (tcph->fin || tcph->rst)
+			return -1;
+	}
+
+	return 0;
+}
+
+static __always_inline void
+xdp_flowtable_update_port_csum(struct flow_ports *ports, void *data_end,
+			       __u8 proto, __be16 port, __be16 nat_port)
+{
+	switch (proto) {
+	case IPPROTO_TCP: {
+		struct tcphdr *tcph = (struct tcphdr *)ports;
+
+		if (tcph + 1 > data_end)
+			break;
+
+		tcph->check = csum_replace4((__u32)tcph->check, (__u32)port,
+					    (__u32)nat_port);
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr *udph = (struct udphdr *)ports;
+
+		if (udph + 1 > data_end)
+			break;
+
+		if (!udph->check)
+			break;
+
+		udph->check = csum_replace4((__u32)udph->check, (__u32)port,
+					    (__u32)nat_port);
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+static __always_inline void
+xdp_flowtable_snat_port(const struct flow_offload *flow,
+			struct flow_ports *ports, void *data_end,
+			__u8 proto, enum flow_offload_tuple_dir dir)
+{
+	__be16 port, nat_port;
+
+	if (ports + 1 > data_end)
+		return;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		port = ports->source;
+		/* For original direction (FLOW_OFFLOAD_DIR_ORIGINAL):
+		 * - tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port contains
+		 *   the source port used for the traffic transmitted by the
+		 *   host.
+		 * - tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port contains
+		 *   the destination port used for the traffic transmitted by
+		 *   the host.
+		 */
+		bpf_core_read(&nat_port, bpf_core_type_size(nat_port),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port);
+		ports->source = nat_port;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		/* For reply direction (FLOW_OFFLOAD_DIR_REPLY):
+		 * - tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port
+		 *   contains source port used for the traffic received by the
+		 *   host.
+		 * - tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port
+		 *   contains the destination port used for the traffic
+		 *   received by the host.
+		 */
+		port = ports->dest;
+		bpf_core_read(&nat_port, bpf_core_type_size(nat_port),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port);
+		ports->dest = nat_port;
+		break;
+	default:
+		return;
+	}
+
+	xdp_flowtable_update_port_csum(ports, data_end, proto, port, nat_port);
+}
+
+static __always_inline void
+xdp_flowtable_dnat_port(const struct flow_offload *flow,
+			struct flow_ports *ports, void *data_end, __u8 proto,
+			enum flow_offload_tuple_dir dir)
+{
+	__be16 port, nat_port;
+
+	if (ports + 1 > data_end)
+		return;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		/* For original direction (FLOW_OFFLOAD_DIR_ORIGINAL):
+		 * - tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port contains
+		 *   the source port used for the traffic transmitted by the
+		 *   host.
+		 * - tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port contains
+		 *   the destination port used for the traffic transmitted by
+		 *   the host.
+		 */
+		port = ports->dest;
+		bpf_core_read(&nat_port, bpf_core_type_size(nat_port),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port);
+		ports->dest = nat_port;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		/* For reply direction (FLOW_OFFLOAD_DIR_REPLY):
+		 * - tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port
+		 *   contains the source port used for the traffic received by
+		 *   the host.
+		 * - tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port
+		 *   contains destination port used for the traffic received by
+		 *   the host.
+		 */
+		port = ports->source;
+		bpf_core_read(&nat_port, bpf_core_type_size(nat_port),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port);
+		ports->source = nat_port;
+		break;
+	default:
+		return;
+	}
+
+	xdp_flowtable_update_port_csum(ports, data_end, proto, port, nat_port);
+}
+
+static __always_inline void
+xdp_flowtable_update_ipv4_csum(struct iphdr *iph, void *data_end,
+			       __be32 addr, __be32 nat_addr)
+{
+	switch (iph->protocol) {
+	case IPPROTO_TCP: {
+		struct tcphdr *tcph = (struct tcphdr *)(iph + 1);
+
+		if (tcph + 1 > data_end)
+			break;
+
+		tcph->check = csum_replace4((__u32)tcph->check, addr,
+					    nat_addr);
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr *udph = (struct udphdr *)(iph + 1);
+
+		if (udph + 1 > data_end)
+			break;
+
+		if (!udph->check)
+			break;
+
+		udph->check = csum_replace4((__u32)udph->check, addr,
+					    nat_addr);
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+static __always_inline void
+xdp_flowtable_snat_ip(const struct flow_offload *flow, struct iphdr *iph,
+		      void *data_end, enum flow_offload_tuple_dir dir)
+{
+	__be32 addr, nat_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = iph->saddr;
+		bpf_core_read(&nat_addr, bpf_core_type_size(nat_addr),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr);
+		iph->saddr = nat_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = iph->daddr;
+		bpf_core_read(&nat_addr, bpf_core_type_size(nat_addr),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr);
+		iph->daddr = nat_addr;
+		break;
+	default:
+		return;
+	}
+	iph->check = csum_replace4((__u32)iph->check, addr, nat_addr);
+
+	xdp_flowtable_update_ipv4_csum(iph, data_end, addr, nat_addr);
+}
+
+static __always_inline void
+xdp_flowtable_get_dnat_ip(__be32 *addr, const struct flow_offload *flow,
+			  enum flow_offload_tuple_dir dir)
+{
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		bpf_core_read(addr, sizeof(*addr),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr);
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		bpf_core_read(addr, sizeof(*addr),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr);
+		break;
+	default:
+		break;
+	}
+}
+
+static __always_inline void
+xdp_flowtable_dnat_ip(const struct flow_offload *flow, struct iphdr *iph,
+		      void *data_end, enum flow_offload_tuple_dir dir)
+{
+	__be32 addr, nat_addr;
+
+	xdp_flowtable_get_dnat_ip(&nat_addr, flow, dir);
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = iph->daddr;
+		iph->daddr = nat_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = iph->saddr;
+		iph->saddr = nat_addr;
+		break;
+	default:
+		return;
+	}
+	iph->check = csum_replace4((__u32)iph->check, addr, nat_addr);
+
+	xdp_flowtable_update_ipv4_csum(iph, data_end, addr, nat_addr);
+}
+
+static __always_inline void
+xdp_flowtable_update_ipv6_csum(struct ipv6hdr *ip6h, void *data_end,
+			       struct in6_addr *addr,
+			       struct in6_addr *nat_addr)
+{
+	switch (ip6h->nexthdr) {
+	case IPPROTO_TCP: {
+		struct tcphdr *tcph = (struct tcphdr *)(ip6h + 1);
+
+		if (tcph + 1 > data_end)
+			break;
+
+		tcph->check = csum_replace16((__u32)tcph->check,
+					     addr->in6_u.u6_addr32,
+					     nat_addr->in6_u.u6_addr32);
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr *udph = (struct udphdr *)(ip6h + 1);
+
+		if (udph + 1 > data_end)
+			break;
+
+		if (!udph->check)
+			break;
+
+		udph->check = csum_replace16((__u32)udph->check,
+					     addr->in6_u.u6_addr32,
+					     nat_addr->in6_u.u6_addr32);
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+static __always_inline void
+xdp_flowtable_snat_ipv6(const struct flow_offload *flow, struct ipv6hdr *ip6h,
+			void *data_end, enum flow_offload_tuple_dir dir)
+{
+	struct in6_addr addr, nat_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = ip6h->saddr;
+		bpf_core_read(&nat_addr, bpf_core_type_size(nat_addr),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6);
+		ip6h->saddr = nat_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = ip6h->daddr;
+		bpf_core_read(&nat_addr, bpf_core_type_size(nat_addr),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6);
+		ip6h->daddr = nat_addr;
+		break;
+	default:
+		return;
+	}
+
+	xdp_flowtable_update_ipv6_csum(ip6h, data_end, &addr, &nat_addr);
+}
+
+static __always_inline void
+xdp_flowtable_get_dnat_ipv6(struct in6_addr *addr,
+			    const struct flow_offload *flow,
+			    enum flow_offload_tuple_dir dir)
+{
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		bpf_core_read(addr, sizeof(*addr),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6);
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		bpf_core_read(addr, sizeof(*addr),
+			      &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6);
+		break;
+	default:
+		break;
+	}
+}
+
+static __always_inline void
+xdp_flowtable_dnat_ipv6(const struct flow_offload *flow, struct ipv6hdr *ip6h,
+			void *data_end, enum flow_offload_tuple_dir dir)
+{
+	struct in6_addr addr, nat_addr;
+
+	xdp_flowtable_get_dnat_ipv6(&nat_addr, flow, dir);
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = ip6h->daddr;
+		ip6h->daddr = nat_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = ip6h->saddr;
+		ip6h->saddr = nat_addr;
+		break;
+	default:
+		return;
+	}
+
+	xdp_flowtable_update_ipv6_csum(ip6h, data_end, &addr, &nat_addr);
+}
+
+static __always_inline void
+xdp_flowtable_forward_ip(const struct flow_offload *flow, void *data,
+			 void *data_end, struct flow_ports *ports,
+			 enum flow_offload_tuple_dir dir,
+			 unsigned long flags)
+{
+	struct iphdr *iph = data + sizeof(struct ethhdr);
+
+	if (iph + 1 > data_end)
+		return;
+
+	if (flags & BIT(NF_FLOW_SNAT)) {
+		xdp_flowtable_snat_port(flow, ports, data_end, iph->protocol,
+					dir);
+		xdp_flowtable_snat_ip(flow, iph, data_end, dir);
+	}
+	if (flags & BIT(NF_FLOW_DNAT)) {
+		xdp_flowtable_dnat_port(flow, ports, data_end, iph->protocol,
+					dir);
+		xdp_flowtable_dnat_ip(flow, iph, data_end, dir);
+	}
+
+	ip_decrease_ttl(iph);
+}
+
+static __always_inline void
+xdp_flowtable_forward_ipv6(const struct flow_offload *flow, void *data,
+			   void *data_end, struct flow_ports *ports,
+			   enum flow_offload_tuple_dir dir,
+			   unsigned long flags)
+{
+	struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
+
+	if (ip6h + 1 > data_end)
+		return;
+
+	if (flags & BIT(NF_FLOW_SNAT)) {
+		xdp_flowtable_snat_port(flow, ports, data_end, ip6h->nexthdr,
+					dir);
+		xdp_flowtable_snat_ipv6(flow, ip6h, data_end, dir);
+	}
+	if (flags & BIT(NF_FLOW_DNAT)) {
+		xdp_flowtable_dnat_port(flow, ports, data_end, ip6h->nexthdr,
+					dir);
+		xdp_flowtable_dnat_ipv6(flow, ip6h, data_end, dir);
+	}
+
+	ip6h->hop_limit--;
+}
+
+SEC("xdp")
+int xdp_fwd_flowtable(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct bpf_fib_lookup tuple = {
+		.ifindex = ctx->ingress_ifindex,
+	};
+	void *data = (void *)(long)ctx->data;
+	struct bpf_flowtable_opts opts = {};
+	enum flow_offload_tuple_dir dir;
+	struct ethhdr *eth = data;
+	struct flow_offload *flow;
+	struct flow_ports *ports;
+	unsigned long flags;
+
+	if (eth + 1 > data_end)
+		return XDP_PASS;
+
+	switch (eth->h_proto) {
+	case bpf_htons(ETH_P_IP): {
+		struct iphdr *iph = data + sizeof(*eth);
+
+		ports = (struct flow_ports *)(iph + 1);
+		if (ports + 1 > data_end)
+			return XDP_PASS;
+
+		/* ip fragmented traffic */
+		if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET))
+			return XDP_PASS;
+
+		/* ip options */
+		if (iph->ihl * 4 != sizeof(*iph))
+			return XDP_PASS;
+
+		if (iph->ttl <= 1)
+			return XDP_PASS;
+
+		if (xdp_flowtable_check_tcp_state(ports, data_end,
+						  iph->protocol) < 0)
+			return XDP_PASS;
+
+		tuple.family		= AF_INET;
+		tuple.tos		= iph->tos;
+		tuple.l4_protocol	= iph->protocol;
+		tuple.tot_len		= bpf_ntohs(iph->tot_len);
+		tuple.ipv4_src		= iph->saddr;
+		tuple.ipv4_dst		= iph->daddr;
+		tuple.sport		= ports->source;
+		tuple.dport		= ports->dest;
+		break;
+	}
+	case bpf_htons(ETH_P_IPV6): {
+		struct in6_addr *src = (struct in6_addr *)tuple.ipv6_src;
+		struct in6_addr *dst = (struct in6_addr *)tuple.ipv6_dst;
+		struct ipv6hdr *ip6h = data + sizeof(*eth);
+
+		ports = (struct flow_ports *)(ip6h + 1);
+		if (ports + 1 > data_end)
+			return XDP_PASS;
+
+		if (ip6h->hop_limit <= 1)
+			return XDP_PASS;
+
+		if (xdp_flowtable_check_tcp_state(ports, data_end,
+						  ip6h->nexthdr) < 0)
+			return XDP_PASS;
+
+		tuple.family		= AF_INET6;
+		tuple.l4_protocol	= ip6h->nexthdr;
+		tuple.tot_len		= bpf_ntohs(ip6h->payload_len);
+		*src			= ip6h->saddr;
+		*dst			= ip6h->daddr;
+		tuple.sport		= ports->source;
+		tuple.dport		= ports->dest;
+		break;
+	}
+	default:
+		return XDP_PASS;
+	}
+	/* Packets without a matching flowtable entry are left to the stack */
+	tuplehash = bpf_xdp_flow_lookup(ctx, &tuple, &opts, sizeof(opts));
+	if (!tuplehash)
+		return XDP_PASS;
+
+	if (tuplehash->tuple.xmit_type != FLOW_OFFLOAD_XMIT_NEIGH)
+		return XDP_PASS;
+
+	dir = tuplehash->tuple.dir;
+	if (dir >= FLOW_OFFLOAD_DIR_MAX)
+		return XDP_PASS;
+	/* tuplehash is tuplehash[dir] of its flow, not always element 0 */
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+	if (bpf_core_read(&flags, sizeof(flags), &flow->flags))
+		return XDP_PASS;
+
+	/* update the destination address in case of dnatting before
+	 * performing the route lookup
+	 */
+	if (tuple.family == AF_INET6) {
+		struct in6_addr *dst_addr = (struct in6_addr *)&tuple.ipv6_dst;
+
+		xdp_flowtable_get_dnat_ipv6(dst_addr, flow, dir);
+	} else {
+		xdp_flowtable_get_dnat_ip(&tuple.ipv4_dst, flow, dir);
+	}
+
+	if (bpf_fib_lookup(ctx, &tuple, sizeof(tuple), 0) !=
+	    BPF_FIB_LKUP_RET_SUCCESS)
+		return XDP_PASS;
+
+	/* Verify egress index has been configured as TX-port */
+	if (!bpf_map_lookup_elem(&xdp_tx_ports, &tuple.ifindex))
+		return XDP_PASS;
+
+	if (tuple.family == AF_INET6)
+		xdp_flowtable_forward_ipv6(flow, data, data_end, ports, dir,
+					   flags);
+	else
+		xdp_flowtable_forward_ip(flow, data, data_end, ports, dir,
+					 flags);
+	/* Set the L2 addresses returned by the FIB lookup, then redirect */
+	__builtin_memcpy(eth->h_dest, tuple.dmac, ETH_ALEN);
+	__builtin_memcpy(eth->h_source, tuple.smac, ETH_ALEN);
+
+	return bpf_redirect_map(&xdp_tx_ports, tuple.ifindex, 0);
+}
+
+char _license[] SEC("license") = "GPL";

From 30e9f7cff1ffbb462f03aba3fbcd1fe9ef45cebc Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 29 Jul 2024 17:36:19 +0200
Subject: [PATCH 2/5] xdp-forward: Add the capability to load xdp_fwd_flowtable
 sample from userspace

Introduce the capability to load the xdp-fwd-flowtable sample in order to
offload the processing of the sw netfilter flowtable to XDP.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 xdp-forward/Makefile                   |  2 +-
 xdp-forward/README.org                 | 33 +++++++++++-
 xdp-forward/xdp-forward.8              | 39 ++++++++++++--
 xdp-forward/xdp-forward.c              | 71 +++++++++++++++++++++-----
 xdp-forward/xdp_flowtable_sample.bpf.c | 37 ++++++++++++++
 5 files changed, 164 insertions(+), 18 deletions(-)
 create mode 100644 xdp-forward/xdp_flowtable_sample.bpf.c

diff --git a/xdp-forward/Makefile b/xdp-forward/Makefile
index 17e8374e..17aa4317 100644
--- a/xdp-forward/Makefile
+++ b/xdp-forward/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-XDP_TARGETS := xdp_forward.bpf xdp_flowtable.bpf
+XDP_TARGETS := xdp_forward.bpf xdp_flowtable.bpf xdp_flowtable_sample.bpf
 BPF_SKEL_TARGETS := $(XDP_TARGETS)
 
 XDP_OBJ_INSTALL :=
diff --git a/xdp-forward/README.org b/xdp-forward/README.org
index d640e18c..bb2dfff8 100644
--- a/xdp-forward/README.org
+++ b/xdp-forward/README.org
@@ -14,8 +14,8 @@ xdp-forward is an XDP forwarding plane, which will accelerate packet forwarding
 using XDP. To use it, simply load it on the set of interfaces to accelerate
 forwarding between. The userspace component of xdp-forward will then configure
 and load XDP programs on those interfaces, and forward packets between them
-using XDP_REDIRECT, using the kernel routing table to determine the destination
-if each packet.
+using XDP_REDIRECT, using the kernel routing table or netfilter flowtable to
+determine the destination for each packet.
 
 Any packets that xdp-forward does not know how to forward will be passed up to
 the networking stack and handled by the kernel like normal. Depending on the
@@ -121,6 +121,35 @@ The =fib-direct= mode functions like =fib-full=, except it passes the
 policy routing rules configured will be skipped during the lookup, which can
 improve performance (but won't obey the policy of those rules, obviously).
 
+** flowtable
+The =flowtable= operating mode offloads netfilter sw flowtable logic in
+the XDP layer if the hardware flowtable is not available.
+At the moment =xdp-forward= is able to offload just TCP or UDP netfilter
+flowtable entries to XDP. The user is supposed to configure the flowtable
+separately.
+
+* Examples
+
+In order to enable flowtable offloading for tcp and udp traffic between NICs
+n0 and n1, issue the following commands:
+
+#+begin_src sh
+#nft -f /dev/stdin <<EOF
+table inet filter {
+    flowtable ft {
+        hook ingress priority filter
+        devices = { n0, n1 }
+    }
+    chain forward {
+        type filter hook forward priority filter
+        meta l4proto { tcp, udp } flow add @ft
+    }
+}
+EOF
+
+#xdp-forward load -f flowtable n0 n1
+#+end_src
+
 * SEE ALSO
 =libxdp(3)= for details on the XDP loading semantics and kernel compatibility
 requirements.
diff --git a/xdp-forward/xdp-forward.8 b/xdp-forward/xdp-forward.8
index ed9816b4..64ec2de6 100644
--- a/xdp-forward/xdp-forward.8
+++ b/xdp-forward/xdp-forward.8
@@ -1,4 +1,4 @@
-.TH "xdp-forward" "8" "JULY 30, 2024" "V1.4.2" "XDP program loader"
+.TH "xdp-forward" "8" "OCTOBER 10, 2024" "V1.4.3" "XDP program loader"
 
 .SH "NAME"
 xdp-forward \- the XDP forwarding plane
@@ -8,8 +8,8 @@ xdp-forward is an XDP forwarding plane, which will accelerate packet forwarding
 using XDP. To use it, simply load it on the set of interfaces to accelerate
 forwarding between. The userspace component of xdp-forward will then configure
 and load XDP programs on those interfaces, and forward packets between them
-using XDP_REDIRECT, using the kernel routing table to determine the destination
-if each packet.
+using XDP_REDIRECT, using the kernel routing table or netfilter flowtable to
+determine the destination for each packet.
 
 .PP
 Any packets that xdp-forward does not know how to forward will be passed up to
@@ -141,6 +141,39 @@ The \fIfib\-direct\fP mode functions like \fIfib\-full\fP, except it passes the
 policy routing rules configured will be skipped during the lookup, which can
 improve performance (but won't obey the policy of those rules, obviously).
 
+.SS "flowtable"
+.PP
+The \fIflowtable\fP operating mode offloads netfilter sw flowtable logic in
+the XDP layer if the hardware flowtable is not available.
+At the moment \fIxdp\-forward\fP is able to offload just TCP or UDP netfilter
+flowtable entries to XDP. The user is supposed to configure the flowtable
+separately.
+
+.SH "Examples"
+.PP
+In order to enable flowtable offloading for tcp and udp traffic between NICs
+n0 and n1, issue the following commands:
+
+.RS
+.nf
+\fC#nft -f /dev/stdin <<EOF
+table inet filter {
+    flowtable ft {
+        hook ingress priority filter
+        devices = { n0, n1 }
+    }
+    chain forward {
+        type filter hook forward priority filter
+        meta l4proto { tcp, udp } flow add @ft
+    }
+}
+EOF
+
+#xdp-forward load -f flowtable n0 n1
+\fP
+.fi
+.RE
+
 .SH "SEE ALSO"
 .PP
 \fIlibxdp(3)\fP for details on the XDP loading semantics and kernel compatibility
diff --git a/xdp-forward/xdp-forward.c b/xdp-forward/xdp-forward.c
index 2604914e..c44e5c66 100644
--- a/xdp-forward/xdp-forward.c
+++ b/xdp-forward/xdp-forward.c
@@ -11,6 +11,8 @@
 #include "compat.h"
 
 #include "xdp_forward.skel.h"
+#include "xdp_flowtable.skel.h"
+#include "xdp_flowtable_sample.skel.h"
 
 #define MAX_IFACE_NUM 32
 #define PROG_NAME "xdp-forward"
@@ -36,11 +38,13 @@ struct enum_val xdp_modes[] = { { "native", XDP_MODE_NATIVE },
 
 enum fwd_mode {
 	FWD_FIB_DIRECT,
-	FWD_FIB_FULL
+	FWD_FIB_FULL,
+	FWD_FLOWTABLE,
 };
 
 struct enum_val fwd_modes[] = { { "fib-direct", FWD_FIB_DIRECT },
 				{ "fib-full", FWD_FIB_FULL },
+				{ "flowtable", FWD_FLOWTABLE },
 				{ NULL, 0 } };
 
 static int find_prog(struct iface *iface, bool detach)
@@ -62,7 +66,8 @@ static int find_prog(struct iface *iface, bool detach)
 	while ((prog = xdp_multiprog__next_prog(prog, mp))) {
 	check:
 		if (!strcmp(xdp_program__name(prog), "xdp_fwd_fib_full") ||
-		    !strcmp(xdp_program__name(prog), "xdp_fwd_fib_direct")) {
+		    !strcmp(xdp_program__name(prog), "xdp_fwd_fib_direct") ||
+		    !strcmp(xdp_program__name(prog), "xdp_fwd_flowtable")) {
 			mode = xdp_multiprog__attach_mode(mp);
 			ret = 0;
 			if (detach) {
@@ -108,15 +113,29 @@ struct prog_option load_options[] = {
 	END_OPTIONS
 };
 
+/* Report whether the stub program calling bpf_xdp_flow_lookup() loads. */
+static bool sample_probe_bpf_xdp_flow_lookup(void)
+{
+	struct xdp_flowtable_sample *probe;
+
+	probe = xdp_flowtable_sample__open_and_load();
+	if (!probe)
+		return false;
+	xdp_flowtable_sample__destroy(probe);
+	return true;
+}
+
 static int do_load(const void *cfg, __unused const char *pin_root_path)
 {
 	DECLARE_LIBBPF_OPTS(xdp_program_opts, opts);
 	struct xdp_program *xdp_prog = NULL;
 	const struct load_opts *opt = cfg;
 	struct bpf_program *prog = NULL;
-	struct xdp_forward *skel;
+	struct bpf_map *map = NULL;
+	struct bpf_object *obj;
 	int ret = EXIT_FAILURE;
 	struct iface *iface;
+	void *skel;
 
 	switch (opt->fwd_mode) {
 	case FWD_FIB_FULL:
@@ -125,23 +144,48 @@ static int do_load(const void *cfg, __unused const char *pin_root_path)
 	case FWD_FIB_DIRECT:
 		opts.prog_name = "xdp_fwd_fib_direct";
 		break;
+	case FWD_FLOWTABLE:
+		opts.prog_name = "xdp_fwd_flowtable";
+		break;
 	default:
 		goto end;
 	}
 
-	skel = xdp_forward__open();
-	if (!skel) {
-		pr_warn("Failed to load skeleton: %s\n", strerror(errno));
-		goto end;
+	if (opt->fwd_mode == FWD_FLOWTABLE) {
+		struct xdp_flowtable *xdp_flowtable_skel;
+
+		if (!sample_probe_bpf_xdp_flow_lookup()) {
+			pr_warn("The kernel does not support the bpf_xdp_flow_lookup() kfunc\n");
+			goto end;
+		}
+
+		xdp_flowtable_skel = xdp_flowtable__open();
+		if (!xdp_flowtable_skel) {
+			pr_warn("Failed to load skeleton: %s\n", strerror(errno));
+			goto end;
+		}
+		map = xdp_flowtable_skel->maps.xdp_tx_ports;
+		obj = xdp_flowtable_skel->obj;
+		skel = (void *)xdp_flowtable_skel;
+	} else {
+		struct xdp_forward *xdp_forward_skel = xdp_forward__open();
+
+		if (!xdp_forward_skel) {
+			pr_warn("Failed to load skeleton: %s\n", strerror(errno));
+			goto end;
+		}
+		map = xdp_forward_skel->maps.xdp_tx_ports;
+		obj = xdp_forward_skel->obj;
+		skel = (void *)xdp_forward_skel;
 	}
 
 	/* Make sure we only load the one XDP program we are interested in */
-	while ((prog = bpf_object__next_program(skel->obj, prog)) != NULL)
+	while ((prog = bpf_object__next_program(obj, prog)) != NULL)
 		if (bpf_program__type(prog) == BPF_PROG_TYPE_XDP &&
 		    bpf_program__expected_attach_type(prog) == BPF_XDP)
 			bpf_program__set_autoload(prog, false);
 
-	opts.obj = skel->obj;
+	opts.obj = obj;
 	xdp_prog = xdp_program__create(&opts);
 	if (!xdp_prog) {
 		ret = -errno;
@@ -177,8 +221,8 @@ static int do_load(const void *cfg, __unused const char *pin_root_path)
 			goto end_detach;
 		}
 
-		ret = bpf_map_update_elem(bpf_map__fd(skel->maps.xdp_tx_ports),
-					  &iface->ifindex, &iface->ifindex, 0);
+		ret = bpf_map_update_elem(bpf_map__fd(map), &iface->ifindex,
+					  &iface->ifindex, 0);
 		if (ret) {
 			pr_warn("Failed to update devmap value: %s\n",
 				strerror(errno));
@@ -188,7 +232,10 @@ static int do_load(const void *cfg, __unused const char *pin_root_path)
 	}
 
 end_destroy:
-	xdp_forward__destroy(skel);
+	if (opt->fwd_mode == FWD_FLOWTABLE)
+		xdp_flowtable__destroy(skel);
+	else
+		xdp_forward__destroy(skel);
 end:
 	return ret;
 
diff --git a/xdp-forward/xdp_flowtable_sample.bpf.c b/xdp-forward/xdp_flowtable_sample.bpf.c
new file mode 100644
index 00000000..185120a2
--- /dev/null
+++ b/xdp-forward/xdp_flowtable_sample.bpf.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Original xdp_fwd sample Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ */
+
+#include <bpf/vmlinux.h>
+#include <linux/bpf.h>
+#include <linux/netfilter.h>
+#include <bpf/bpf_core_read.h>
+
+#define AF_INET		2
+
+struct bpf_flowtable_opts {
+	__s32 error;
+};
+
+struct flow_offload_tuple_rhash *
+bpf_xdp_flow_lookup(struct xdp_md *, struct bpf_fib_lookup *,
+		    struct bpf_flowtable_opts *, __u32) __ksym;
+
+SEC("xdp")
+int xdp_fwd_flowtable_sample(struct xdp_md *ctx)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct bpf_flowtable_opts opts = {};
+	struct bpf_fib_lookup tuple = {
+		.family = AF_INET,
+		.ifindex = ctx->ingress_ifindex,
+	};
+
+	tuplehash = bpf_xdp_flow_lookup(ctx, &tuple, &opts, sizeof(opts));
+	if (!tuplehash)
+		return XDP_DROP;
+
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";

From 3d59f383a3ea8826f15bf37b080156b477114809 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Sat, 14 Sep 2024 19:19:36 +0200
Subject: [PATCH 3/5] xdp-forward: Add selftest for flowtable mode

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 .github/workflows/selftests.yml       |  2 +-
 lib/testing/test_runner.sh            |  2 +-
 xdp-forward/tests/test-xdp-forward.sh | 88 +++++++++++++++++++++++++--
 3 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/selftests.yml b/.github/workflows/selftests.yml
index 0b29d153..cda46f3d 100644
--- a/.github/workflows/selftests.yml
+++ b/.github/workflows/selftests.yml
@@ -42,7 +42,7 @@ jobs:
       - name: Prepare packages
         run: |
           sudo apt-get update
-          sudo apt-get install zstd binutils-dev elfutils libpcap-dev libelf-dev gcc-multilib pkg-config wireshark tshark bpfcc-tools python3 python3-pip python3-setuptools qemu-kvm rpm2cpio libdw-dev libdwarf-dev libcap-ng-dev
+          sudo apt-get install zstd binutils-dev elfutils libpcap-dev libelf-dev gcc-multilib pkg-config wireshark tshark bpfcc-tools python3 python3-pip python3-setuptools qemu-kvm rpm2cpio libdw-dev libdwarf-dev libcap-ng-dev socat
       - name: Prepare Clang
         run: |
           wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
diff --git a/lib/testing/test_runner.sh b/lib/testing/test_runner.sh
index 8805e38f..90c7cda7 100755
--- a/lib/testing/test_runner.sh
+++ b/lib/testing/test_runner.sh
@@ -25,7 +25,7 @@ ALL_TESTS=""
 VERBOSE_TESTS=${V:-0}
 NUM_NS=2
 
-NEEDED_TOOLS="capinfos ethtool ip ping sed tc tcpdump timeout nc tshark"
+NEEDED_TOOLS="capinfos ethtool ip ping sed tc tcpdump timeout nc tshark nft socat"
 
 if [ -f "$TEST_CONFIG" ]; then
     source "$TEST_CONFIG"
diff --git a/xdp-forward/tests/test-xdp-forward.sh b/xdp-forward/tests/test-xdp-forward.sh
index 2a6988e8..f3a4b080 100644
--- a/xdp-forward/tests/test-xdp-forward.sh
+++ b/xdp-forward/tests/test-xdp-forward.sh
@@ -1,7 +1,6 @@
 XDP_LOADER=${XDP_LOADER:-./xdp-loader}
 XDP_FORWARD=${XDP_FORWARD:-./xdp-forward}
-ALL_TESTS="test_ping test_load test_fwd_full test_fwd_direct"
-
+ALL_TESTS="test_ping test_load test_fwd_full test_fwd_direct test_flowtable"
 
 test_ping()
 {
@@ -52,8 +51,89 @@ test_fwd_direct()
     check_run $XDP_FORWARD unload ${NS_NAMES[@]}
 }
 
+test_flowtable()
+{
+    local INPUT_FILE="${STATEDIR}/in_$$_$RANDOM"
+
+    # veth NAPI GRO support added this symbol; forwarding won't work without it
+    skip_if_missing_kernel_symbol veth_set_features
+
+    # disable {tx,rx} checksum offload since it is not currently supported
+    # by XDP_REDIRECT
+    for n in ${NS_NAMES[@]}; do
+        ip netns exec $n ethtool -K veth0 tx-checksumming off rx-checksumming off
+        ethtool -K $n tx-checksumming off rx-checksumming off
+    done
+
+    # create data to send via tcp
+    dd if=/dev/urandom of="${INPUT_FILE}" bs=8192 count=32 status=none
+
+    # create flowtable configuration in the main namespace
+    check_run nft -f /dev/stdin <<EOF
+table inet nat {
+    # enable DNAT to server <ip:port> in pre-routing chain
+    chain prerouting {
+        type nat hook prerouting priority filter; policy accept;
+        iifname == "${NS_NAMES[0]}" meta nfproto ipv4 tcp dport 12345 dnat ip to ${ALL_INSIDE_IP4[-1]}:10000
+        iifname == "${NS_NAMES[0]}" meta nfproto ipv6 tcp dport 12345 dnat ip6 to [${ALL_INSIDE_IP6[-1]}]:10000
+    }
+    # enable SNAT of the client ip via masquerading in post-routing chain
+    chain postrouting {
+        type nat hook postrouting priority filter; policy accept;
+        oifname "${NS_NAMES[-1]}" masquerade
+    }
+}
+table inet filter {
+    flowtable ft {
+        hook ingress priority filter
+        devices = { ${NS_NAMES[0]}, ${NS_NAMES[-1]} }
+    }
+    chain forward {
+        type filter hook forward priority filter
+        meta l4proto { tcp } flow add @ft
+    }
+}
+EOF
+
+    # check if bpf flowtable lookup is available
+    skip_if_missing_kernel_symbol bpf_xdp_flow_lookup
+
+    # Add some nft rules to check {dnat/snat} is done properly in
+    # the main namespace
+    check_run ip netns exec ${NS_NAMES[-1]} nft -f /dev/stdin <<EOF
+table inet filter {
+    chain input {
+        type filter hook input priority 0; policy drop
+        ip saddr $OUTSIDE_IP4 ip daddr ${ALL_INSIDE_IP4[-1]} tcp dport 10000 accept
+        ip6 saddr $OUTSIDE_IP6 ip6 daddr ${ALL_INSIDE_IP6[-1]} tcp dport 10000 accept
+    }
+}
+EOF
+    # wait a bit to configure nft
+    sleep 2
+
+    check_run $XDP_FORWARD load -f flowtable ${NS_NAMES[@]}
+
+    PID=$(start_background_ns_devnull "socat -4 TCP-LISTEN:10000,reuseaddr,fork -")
+    check_run ip netns exec ${NS_NAMES[0]} socat ${INPUT_FILE} TCP4:${OUTSIDE_IP4}:12345
+    stop_background $PID
+
+    PID=$(start_background_ns_devnull "socat -6 TCP-LISTEN:10000,reuseaddr,fork -")
+    check_run ip netns exec ${NS_NAMES[0]} socat ${INPUT_FILE} TCP6:[${OUTSIDE_IP6}]:12345
+    stop_background $PID
+}
+
 cleanup_tests()
 {
-    $XDP_FORWARD unload ${NS_NAMES[@]} >/dev/null 2>&1
-    $XDP_LOADER unload $NS --all >/dev/null 2>&1
+    # enable {tx,rx} checksum offload
+    for n in ${NS_NAMES[@]}; do
+        ip netns exec $n ethtool -K veth0 tx-checksumming on rx-checksumming on
+        ethtool -K $n tx-checksumming on rx-checksumming on
+    done >/dev/null 2>&1
+    {
+        $XDP_FORWARD unload ${NS_NAMES[@]}
+        $XDP_LOADER unload $NS --all
+        check_run ip netns exec ${NS_NAMES[-1]} nft flush ruleset
+        check_run nft flush ruleset
+    } >/dev/null 2>&1
 }

From 9c35c0bd171328d2c256a704ae7a4767b136d6f4 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Wed, 2 Oct 2024 16:21:40 +0200
Subject: [PATCH 4/5] xdp-forward: split fwd-mode into fib-mode and fwd-mode

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 xdp-forward/README.org                | 36 ++++++++++++++++-------
 xdp-forward/tests/test-xdp-forward.sh |  4 +--
 xdp-forward/xdp-forward.8             | 41 +++++++++++++++++++--------
 xdp-forward/xdp-forward.c             | 39 ++++++++++++++++---------
 xdp-forward/xdp_flowtable.bpf.c       | 18 ++++++++++--
 5 files changed, 97 insertions(+), 41 deletions(-)

diff --git a/xdp-forward/README.org b/xdp-forward/README.org
index bb2dfff8..ab30906b 100644
--- a/xdp-forward/README.org
+++ b/xdp-forward/README.org
@@ -56,7 +56,11 @@ Specifies which forwarding mode =xdp-forward= should operate in. Depending on
 the mode selected, =xdp-forward= will perform forwarding in different ways,
 which can lead to different behaviour, including which subset of kernel
 configuration (such as firewall rules) is respected during forwarding. See the
-section *OPERATING MODES* below for a full description of each mode.
+section *FORWARDING MODES* below for a full description of each mode.
+
+** -F, --fib-mode <mode>
+Specifies how =xdp-forward= performs routing table lookups in the Linux kernel.
+See the section *FIB MODES* below for a full description of each mode.
 
 ** -m, --mode <mode>
 Specifies which mode to load the XDP program to be loaded in. The valid values
@@ -98,12 +102,12 @@ Enable debug logging. Specify twice for even more verbosity.
 ** -h, --help
 Display a summary of the available options
 
-* OPERATING MODES
-The =xdp-forward= utility supports the following operating modes (selected by
+* FORWARDING MODES
+The =xdp-forward= utility supports the following forwarding modes (selected by
 the =--fwd-mode= parameter to =xdp-forward load=.
 
-** fib-full (default)
-In the =fib-full= operating mode, =xdp-forward= will perform a full lookup in
+** fib (default)
+In the =fib= forwarding mode, =xdp-forward= will perform a lookup in
 the kernel routing table (or FIB) for each packet, and forward packets between
 the configured interfaces based on the result of the lookup. Any packet where
 the lookup fails will be passed up to the stack. This includes packets that
@@ -115,12 +119,6 @@ Note that no checks other than the FIB lookup is performed; in particular, this
 completely bypasses the netfilter subsystem, so firewall rules will not be
 checked before forwarding.
 
-** fib-direct
-The =fib-direct= mode functions like =fib-full=, except it passes the
-=BPF_FIB_LOOKUP_DIRECT= flag to the FIB lookup routine. This means that any
-policy routing rules configured will be skipped during the lookup, which can
-improve performance (but won't obey the policy of those rules, obviously).
-
 ** flowtable
 The =flowtable= operating mode offloads netfilter sw flowtable logic in
 the XDP layer if the hardware flowtable is not available.
@@ -128,6 +126,22 @@ At the moment =xdp-forward= is able to offload just TCP or UDP netfilter
 flowtable entries to XDP. The user is supposed to configure the flowtable
 separately.
 
+* FIB MODES
+The =xdp-forward= utility supports the following fib modes (selected by
+the =--fib-mode= parameter to =xdp-forward load=).
+
+** full (default)
+In the =full= fib mode, =xdp-forward= will perform a full lookup in
+the kernel routing table (or FIB) for each packet, and forward packets between
+the configured interfaces based on the result of the lookup. In particular,
+it will apply any policy routing rules configured by the user.
+
+** direct
+The =direct= mode functions like =full=, except it passes the
+=BPF_FIB_LOOKUP_DIRECT= flag to the FIB lookup routine. This means that any
+policy routing rules configured will be skipped during the lookup, which can
+improve performance (but won't obey the policy of those rules, obviously).
+
 * Examples
 
 In order to enable flowtable offloading for tcp and udp traffic between NICs
diff --git a/xdp-forward/tests/test-xdp-forward.sh b/xdp-forward/tests/test-xdp-forward.sh
index f3a4b080..68fe0a0a 100644
--- a/xdp-forward/tests/test-xdp-forward.sh
+++ b/xdp-forward/tests/test-xdp-forward.sh
@@ -26,7 +26,7 @@ test_fwd_full()
     # veth NAPI GRO support added this symbol; forwarding won't work without it
     skip_if_missing_kernel_symbol veth_set_features
 
-    check_run $XDP_FORWARD load -f fib-full ${NS_NAMES[@]}
+    check_run $XDP_FORWARD load -f fib -F full ${NS_NAMES[@]}
     for ip in "${ALL_INSIDE_IP4[@]}"; do
         check_run ns_exec ping -c 1 -W 2 $ip
     done
@@ -41,7 +41,7 @@ test_fwd_direct()
     # veth NAPI GRO support added this symbol; forwarding won't work without it
     skip_if_missing_kernel_symbol veth_set_features
 
-    check_run $XDP_FORWARD load -f fib-direct ${NS_NAMES[@]}
+    check_run $XDP_FORWARD load -f fib -F direct ${NS_NAMES[@]}
     for ip in "${ALL_INSIDE_IP4[@]}"; do
         check_run ns_exec ping -c 1 -W 2 $ip
     done
diff --git a/xdp-forward/xdp-forward.8 b/xdp-forward/xdp-forward.8
index 64ec2de6..3c8f7804 100644
--- a/xdp-forward/xdp-forward.8
+++ b/xdp-forward/xdp-forward.8
@@ -62,7 +62,12 @@ Specifies which forwarding mode \fIxdp\-forward\fP should operate in. Depending
 the mode selected, \fIxdp\-forward\fP will perform forwarding in different ways,
 which can lead to different behaviour, including which subset of kernel
 configuration (such as firewall rules) is respected during forwarding. See the
-section \fBOPERATING MODES\fP below for a full description of each mode.
+section \fBFORWARDING MODES\fP below for a full description of each mode.
+
+.SS "-F, --fib-mode <mode>"
+.PP
+Specifies how \fIxdp\-forward\fP performs routing table lookups in the Linux kernel.
+See the section \fBFIB MODES\fP below for a full description of each mode.
 
 .SS "-m, --mode <mode>"
 .PP
@@ -114,14 +119,14 @@ Enable debug logging. Specify twice for even more verbosity.
 .PP
 Display a summary of the available options
 
-.SH "OPERATING MODES"
+.SH "FORWARDING MODES"
 .PP
-The \fIxdp\-forward\fP utility supports the following operating modes (selected by
+The \fIxdp\-forward\fP utility supports the following forwarding modes (selected by
 the \fI\-\-fwd\-mode\fP parameter to \fIxdp\-forward load\fP.
 
-.SS "fib-full (default)"
+.SS "fib (default)"
 .PP
-In the \fIfib\-full\fP operating mode, \fIxdp\-forward\fP will perform a full lookup in
+In the \fIfib\fP forwarding mode, \fIxdp\-forward\fP will perform a lookup in
 the kernel routing table (or FIB) for each packet, and forward packets between
 the configured interfaces based on the result of the lookup. Any packet where
 the lookup fails will be passed up to the stack. This includes packets that
@@ -134,13 +139,6 @@ Note that no checks other than the FIB lookup is performed; in particular, this
 completely bypasses the netfilter subsystem, so firewall rules will not be
 checked before forwarding.
 
-.SS "fib-direct"
-.PP
-The \fIfib\-direct\fP mode functions like \fIfib\-full\fP, except it passes the
-\fIBPF_FIB_LOOKUP_DIRECT\fP flag to the FIB lookup routine. This means that any
-policy routing rules configured will be skipped during the lookup, which can
-improve performance (but won't obey the policy of those rules, obviously).
-
 .SS "flowtable"
 .PP
 The \fIflowtable\fP operating mode offloads netfilter sw flowtable logic in
@@ -149,6 +147,25 @@ At the moment \fIxdp\-forward\fP is able to offload just TCP or UDP netfilter
 flowtable entries to XDP. The user is supposed to configure the flowtable
 separately.
 
+.SH "FIB MODES"
+.PP
+The \fIxdp\-forward\fP utility supports the following fib modes (selected by
+the \fI\-\-fib\-mode\fP parameter to \fIxdp\-forward load\fP).
+
+.SS "full (default)"
+.PP
+In the \fIfull\fP fib mode, \fIxdp\-forward\fP will perform a full lookup in
+the kernel routing table (or FIB) for each packet, and forward packets between
+the configured interfaces based on the result of the lookup. In particular,
+it will apply any policy routing rules configured by the user.
+
+.SS "direct"
+.PP
+The \fIdirect\fP mode functions like \fIfull\fP, except it passes the
+\fIBPF_FIB_LOOKUP_DIRECT\fP flag to the FIB lookup routine. This means that any
+policy routing rules configured will be skipped during the lookup, which can
+improve performance (but won't obey the policy of those rules, obviously).
+
 .SH "Examples"
 .PP
 In order to enable flowtable offloading for tcp and udp traffic between NICs
diff --git a/xdp-forward/xdp-forward.c b/xdp-forward/xdp-forward.c
index c44e5c66..5549e6f4 100644
--- a/xdp-forward/xdp-forward.c
+++ b/xdp-forward/xdp-forward.c
@@ -37,16 +37,23 @@ struct enum_val xdp_modes[] = { { "native", XDP_MODE_NATIVE },
 				{ NULL, 0 } };
 
 enum fwd_mode {
-	FWD_FIB_DIRECT,
-	FWD_FIB_FULL,
+	FWD_FIB,
 	FWD_FLOWTABLE,
 };
 
-struct enum_val fwd_modes[] = { { "fib-direct", FWD_FIB_DIRECT },
-				{ "fib-full", FWD_FIB_FULL },
+struct enum_val fwd_modes[] = { { "fib", FWD_FIB },
 				{ "flowtable", FWD_FLOWTABLE },
 				{ NULL, 0 } };
 
+enum fib_mode {
+	FIB_DIRECT,
+	FIB_FULL,
+};
+
+struct enum_val fib_modes[] = { { "direct", FIB_DIRECT },
+				{ "full", FIB_FULL },
+				{ NULL, 0 } };
+
 static int find_prog(struct iface *iface, bool detach)
 {
 	struct xdp_program *prog = NULL;
@@ -88,16 +95,22 @@ static int find_prog(struct iface *iface, bool detach)
 
 struct load_opts {
 	enum fwd_mode fwd_mode;
+	enum fib_mode fib_mode;
 	enum xdp_attach_mode xdp_mode;
 	struct iface *ifaces;
-} defaults_load = { .fwd_mode = FWD_FIB_FULL };
+} defaults_load = { .fwd_mode = FWD_FIB, .fib_mode = FIB_FULL, };
 
 struct prog_option load_options[] = {
 	DEFINE_OPTION("fwd-mode", OPT_ENUM, struct load_opts, fwd_mode,
 		      .short_opt = 'f',
 		      .typearg = fwd_modes,
-		      .metavar = "<mode>",
-		      .help = "Forward mode to run in; see man page. Default fib-full"),
+		      .metavar = "<fwd-mode>",
+		      .help = "Forward mode to run in; see man page. Default fib"),
+	DEFINE_OPTION("fib-mode", OPT_ENUM, struct load_opts, fib_mode,
+		      .short_opt = 'F',
+		      .typearg = fib_modes,
+		      .metavar = "<fib-mode>",
+		      .help = "Fib mode to run in; see man page. Default full"),
 	DEFINE_OPTION("xdp-mode", OPT_ENUM, struct load_opts, xdp_mode,
 		      .short_opt = 'm',
 		      .typearg = xdp_modes,
@@ -138,14 +151,14 @@ static int do_load(const void *cfg, __unused const char *pin_root_path)
 	void *skel;
 
 	switch (opt->fwd_mode) {
-	case FWD_FIB_FULL:
-		opts.prog_name = "xdp_fwd_fib_full";
-		break;
-	case FWD_FIB_DIRECT:
-		opts.prog_name = "xdp_fwd_fib_direct";
+	case FWD_FIB:
+		opts.prog_name = opt->fib_mode == FIB_DIRECT
+				 ? "xdp_fwd_fib_direct" : "xdp_fwd_fib_full";
 		break;
 	case FWD_FLOWTABLE:
-		opts.prog_name = "xdp_fwd_flowtable";
+		opts.prog_name = opt->fib_mode == FIB_DIRECT
+				 ? "xdp_fwd_flowtable_direct"
+				 : "xdp_fwd_flowtable_full";
 		break;
 	default:
 		goto end;
diff --git a/xdp-forward/xdp_flowtable.bpf.c b/xdp-forward/xdp_flowtable.bpf.c
index 32f627d7..192c50b6 100644
--- a/xdp-forward/xdp_flowtable.bpf.c
+++ b/xdp-forward/xdp_flowtable.bpf.c
@@ -478,8 +478,8 @@ xdp_flowtable_forward_ipv6(const struct flow_offload *flow, void *data,
 	ip6h->hop_limit--;
 }
 
-SEC("xdp")
-int xdp_fwd_flowtable(struct xdp_md *ctx)
+static __always_inline int xdp_flowtable_flags(struct xdp_md *ctx,
+					       __u32 fib_flags)
 {
 	void *data_end = (void *)(long)ctx->data_end;
 	struct flow_offload_tuple_rhash *tuplehash;
@@ -585,7 +585,7 @@ int xdp_fwd_flowtable(struct xdp_md *ctx)
 		xdp_flowtable_get_dnat_ip(&tuple.ipv4_dst, flow, dir);
 	}
 
-	if (bpf_fib_lookup(ctx, &tuple, sizeof(tuple), 0) !=
+	if (bpf_fib_lookup(ctx, &tuple, sizeof(tuple), fib_flags) !=
 	    BPF_FIB_LKUP_RET_SUCCESS)
 		return XDP_PASS;
 
@@ -606,4 +606,16 @@ int xdp_fwd_flowtable(struct xdp_md *ctx)
 	return bpf_redirect_map(&xdp_tx_ports, tuple.ifindex, 0);
 }
 
+SEC("xdp") /* flowtable forwarding entry point; the FIB lookup runs with no flags set */
+int xdp_fwd_flowtable_full(struct xdp_md *ctx)
+{
+	return xdp_flowtable_flags(ctx, 0); /* fib_flags = 0 is handed to bpf_fib_lookup() */
+}
+
+SEC("xdp") /* flowtable forwarding entry point; the FIB lookup runs with BPF_FIB_LOOKUP_DIRECT */
+int xdp_fwd_flowtable_direct(struct xdp_md *ctx)
+{
+	return xdp_flowtable_flags(ctx, BPF_FIB_LOOKUP_DIRECT); /* flag is handed to bpf_fib_lookup() */
+}
+
 char _license[] SEC("license") = "GPL";

From fdf6536d06fac45805aeea71d98cfaea55fd598e Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 11 Oct 2024 11:34:45 +0200
Subject: [PATCH 5/5] CI: add 6.12 kernel version

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 .github/workflows/selftests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/selftests.yml b/.github/workflows/selftests.yml
index cda46f3d..94b95920 100644
--- a/.github/workflows/selftests.yml
+++ b/.github/workflows/selftests.yml
@@ -12,6 +12,7 @@ jobs:
     strategy:
       matrix:
         KERNEL_VERSION:
+          - "6.12.0-0.rc2.24.fc42"
           - "6.10.1-200.fc40"
           - "6.6.14-200.fc39"
           - "6.1.9-200.fc37"