diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 9e525646df1d9fa061df7048596d8399b77eff8e..15b2e3923c476bbd3a910a1aca49410a18fcd484 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -57,6 +57,7 @@ struct veth_rq_stats {
 
 struct veth_rq {
 	struct napi_struct	xdp_napi;
+	struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
 	struct net_device	*dev;
 	struct bpf_prog __rcu	*xdp_prog;
 	struct xdp_mem_info	xdp_mem;
@@ -293,13 +294,32 @@ static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
 		netif_rx(skb);
 }
 
+/* return true if the specified skb has chances of GRO aggregation
+ * Don't strive for accuracy, but try to avoid GRO overhead in the most
+ * common scenarios.
+ * When XDP is enabled, all traffic is considered eligible, as the xmit
+ * device has TSO off.
+ * When TSO is enabled on the xmit device, we are likely interested only
+ * in UDP aggregation, explicitly check for that if the skb is suspected
+ * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets -
+ * to belong to locally generated UDP traffic.
+ */
+static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
+					 const struct net_device *rcv,
+					 const struct sk_buff *skb)
+{
+	return !(dev->features & NETIF_F_ALL_TSO) ||
+		(skb->destructor == sock_wfree &&
+		 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
+}
+
 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
 	struct veth_rq *rq = NULL;
 	struct net_device *rcv;
 	int length = skb->len;
-	bool rcv_xdp = false;
+	bool use_napi = false;
 	int rxq;
 
 	rcu_read_lock();
@@ -313,20 +333,26 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	rxq = skb_get_queue_mapping(skb);
 	if (rxq < rcv->real_num_rx_queues) {
 		rq = &rcv_priv->rq[rxq];
-		rcv_xdp = rcu_access_pointer(rq->xdp_prog);
+
+		/* The napi pointer is available when an XDP program is
+		 * attached or when GRO is enabled
+		 * Don't bother with napi/GRO if the skb can't be aggregated
+		 */
+		use_napi = rcu_access_pointer(rq->napi) &&
+			   veth_skb_is_eligible_for_gro(dev, rcv, skb);
 		skb_record_rx_queue(skb, rxq);
 	}
 
 	skb_tx_timestamp(skb);
-	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
-		if (!rcv_xdp)
+	if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
+		if (!use_napi)
 			dev_lstats_add(dev, length);
 	} else {
 drop:
 		atomic64_inc(&priv->dropped);
 	}
 
-	if (rcv_xdp)
+	if (use_napi)
 		__veth_xdp_flush(rq);
 
 	rcu_read_unlock();
@@ -686,7 +712,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 	int mac_len, delta, off;
 	struct xdp_buff xdp;
 
-	skb_orphan(skb);
+	skb_orphan_partial(skb);
 
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -903,7 +929,7 @@ static int veth_poll(struct napi_struct *napi, int budget)
 	return done;
 }
 
-static int veth_napi_add(struct net_device *dev)
+static int __veth_napi_enable(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
 	int err, i;
@@ -920,6 +946,7 @@ static int veth_napi_add(struct net_device *dev)
 		struct veth_rq *rq = &priv->rq[i];
 
 		napi_enable(&rq->xdp_napi);
+		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
 	}
 
 	return 0;
@@ -938,6 +965,7 @@ static void veth_napi_del(struct net_device *dev)
 	for (i = 0; i < dev->real_num_rx_queues; i++) {
 		struct veth_rq *rq = &priv->rq[i];
 
+		rcu_assign_pointer(priv->rq[i].napi, NULL);
 		napi_disable(&rq->xdp_napi);
 		__netif_napi_del(&rq->xdp_napi);
 	}
@@ -951,8 +979,14 @@ static void veth_napi_del(struct net_device *dev)
 	}
 }
 
+static bool veth_gro_requested(const struct net_device *dev)
+{
+	return !!(dev->wanted_features & NETIF_F_GRO);
+}
+
 static int veth_enable_xdp(struct net_device *dev)
 {
+	bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
 	struct veth_priv *priv = netdev_priv(dev);
 	int err, i;
 
@@ -960,7 +994,8 @@ static int veth_enable_xdp(struct net_device *dev)
 		for (i = 0; i < dev->real_num_rx_queues; i++) {
 			struct veth_rq *rq = &priv->rq[i];
 
-			netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
+			if (!napi_already_on)
+				netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
 			err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
 			if (err < 0)
 				goto err_rxq_reg;
@@ -975,13 +1010,25 @@ static int veth_enable_xdp(struct net_device *dev)
 			rq->xdp_mem = rq->xdp_rxq.mem;
 		}
 
-		err = veth_napi_add(dev);
-		if (err)
-			goto err_rxq_reg;
+		if (!napi_already_on) {
+			err = __veth_napi_enable(dev);
+			if (err)
+				goto err_rxq_reg;
+
+			if (!veth_gro_requested(dev)) {
+				/* user-space did not require GRO, but adding XDP
+				 * is supposed to get GRO working
+				 */
+				dev->features |= NETIF_F_GRO;
+				netdev_features_change(dev);
+			}
+		}
 	}
 
-	for (i = 0; i < dev->real_num_rx_queues; i++)
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
 		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
+		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
+	}
 
 	return 0;
 err_reg_mem:
@@ -991,7 +1038,8 @@ err_rxq_reg:
 		struct veth_rq *rq = &priv->rq[i];
 
 		xdp_rxq_info_unreg(&rq->xdp_rxq);
-		netif_napi_del(&rq->xdp_napi);
+		if (!napi_already_on)
+			netif_napi_del(&rq->xdp_napi);
 	}
 
 	return err;
@@ -1004,7 +1052,19 @@ static void veth_disable_xdp(struct net_device *dev)
 
 	for (i = 0; i < dev->real_num_rx_queues; i++)
 		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
-	veth_napi_del(dev);
+
+	if (!netif_running(dev) || !veth_gro_requested(dev)) {
+		veth_napi_del(dev);
+
+		/* if user-space did not require GRO, since adding XDP
+		 * enabled it, clear it now
+		 */
+		if (!veth_gro_requested(dev) && netif_running(dev)) {
+			dev->features &= ~NETIF_F_GRO;
+			netdev_features_change(dev);
+		}
+	}
+
 	for (i = 0; i < dev->real_num_rx_queues; i++) {
 		struct veth_rq *rq = &priv->rq[i];
 
@@ -1013,6 +1073,29 @@ static void veth_disable_xdp(struct net_device *dev)
 	}
 }
 
+static int veth_napi_enable(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	int err, i;
+
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
+		struct veth_rq *rq = &priv->rq[i];
+
+		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
+	}
+
+	err = __veth_napi_enable(dev);
+	if (err) {
+		for (i = 0; i < dev->real_num_rx_queues; i++) {
+			struct veth_rq *rq = &priv->rq[i];
+
+			netif_napi_del(&rq->xdp_napi);
+		}
+		return err;
+	}
+	return err;
+}
+
 static int veth_open(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
@@ -1026,6 +1109,10 @@ static int veth_open(struct net_device *dev)
 		err = veth_enable_xdp(dev);
 		if (err)
 			return err;
+	} else if (veth_gro_requested(dev)) {
+		err = veth_napi_enable(dev);
+		if (err)
+			return err;
 	}
 
 	if (peer->flags & IFF_UP) {
@@ -1047,6 +1134,8 @@ static int veth_close(struct net_device *dev)
 
 	if (priv->_xdp_prog)
 		veth_disable_xdp(dev);
+	else if (veth_gro_requested(dev))
+		veth_napi_del(dev);
 
 	return 0;
 }
@@ -1145,10 +1234,32 @@ static netdev_features_t veth_fix_features(struct net_device *dev,
 		if (peer_priv->_xdp_prog)
 			features &= ~NETIF_F_GSO_SOFTWARE;
 	}
+	if (priv->_xdp_prog)
+		features |= NETIF_F_GRO;
 
 	return features;
 }
 
+static int veth_set_features(struct net_device *dev,
+			     netdev_features_t features)
+{
+	netdev_features_t changed = features ^ dev->features;
+	struct veth_priv *priv = netdev_priv(dev);
+	int err;
+
+	if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
+		return 0;
+
+	if (features & NETIF_F_GRO) {
+		err = veth_napi_enable(dev);
+		if (err)
+			return err;
+	} else {
+		veth_napi_del(dev);
+	}
+	return 0;
+}
+
 static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
 {
 	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
@@ -1267,6 +1378,7 @@ static const struct net_device_ops veth_netdev_ops = {
 #endif
 	.ndo_get_iflink		= veth_get_iflink,
 	.ndo_fix_features	= veth_fix_features,
+	.ndo_set_features	= veth_set_features,
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
 	.ndo_bpf		= veth_xdp,
@@ -1329,6 +1441,13 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
 
 static struct rtnl_link_ops veth_link_ops;
 
+static void veth_disable_gro(struct net_device *dev)
+{
+	dev->features &= ~NETIF_F_GRO;
+	dev->wanted_features &= ~NETIF_F_GRO;
+	netdev_update_features(dev);
+}
+
 static int veth_newlink(struct net *src_net, struct net_device *dev,
 			struct nlattr *tb[], struct nlattr *data[],
 			struct netlink_ext_ack *extack)
@@ -1401,6 +1520,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
 	if (err < 0)
 		goto err_register_peer;
 
+	/* keep GRO disabled by default to be consistent with the established
+	 * veth behavior
+	 */
+	veth_disable_gro(peer);
 	netif_carrier_off(peer);
 
 	err = rtnl_configure_link(peer, ifmp);
@@ -1438,6 +1561,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
 	priv = netdev_priv(peer);
 	rcu_assign_pointer(priv->peer, dev);
 
+	veth_disable_gro(dev);
 	return 0;
 
 err_register_dev:
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 2d71b283dde36335d5c6cca9cd59ef88539903ab..f4242a96108838db9d60d617e50b2ca79297afd4 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -24,6 +24,7 @@ TEST_PROGS += vrf_route_leaking.sh
 TEST_PROGS += bareudp.sh
 TEST_PROGS += unicast_extensions.sh
 TEST_PROGS += udpgro_fwd.sh
+TEST_PROGS += veth.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket nettest
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2fedc0781ce8c66a342d774889c9eaa87ec1a337
--- /dev/null
+++ b/tools/testing/selftests/net/veth.sh
@@ -0,0 +1,177 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+readonly STATS="$(mktemp -p /tmp ns-XXXXXX)"
+readonly BASE=`basename $STATS`
+readonly SRC=2
+readonly DST=1
+readonly DST_NAT=100
+readonly NS_SRC=$BASE$SRC
+readonly NS_DST=$BASE$DST
+
+# "baremetal" network used for raw UDP traffic
+readonly BM_NET_V4=192.168.1.
+readonly BM_NET_V6=2001:db8::
+
+readonly NPROCS=`nproc`
+ret=0
+
+cleanup() {
+	local ns
+	local -r jobs="$(jobs -p)"
+	[ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null
+	rm -f $STATS
+
+	for ns in $NS_SRC $NS_DST; do
+		ip netns del $ns 2>/dev/null
+	done
+}
+
+trap cleanup EXIT
+
+create_ns() {
+	local ns
+
+	for ns in $NS_SRC $NS_DST; do
+		ip netns add $ns
+		ip -n $ns link set dev lo up
+	done
+
+	ip link add name veth$SRC type veth peer name veth$DST
+
+	for ns in $SRC $DST; do
+		ip link set dev veth$ns netns $BASE$ns up
+		ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24
+		ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad
+	done
+	echo "#kernel" > $BASE
+	chmod go-rw $BASE
+}
+
+__chk_flag() {
+	local msg="$1"
+	local target=$2
+	local expected=$3
+	local flagname=$4
+
+	local flag=`ip netns exec $BASE$target ethtool -k veth$target |\
+		    grep $flagname | awk '{print $2}'`
+
+	printf "%-60s" "$msg"
+	if [ "$flag" = "$expected" ]; then
+		echo " ok "
+	else
+		echo " fail - expected $expected found $flag"
+		ret=1
+	fi
+}
+
+chk_gro_flag() {
+	__chk_flag "$1" $2 $3 generic-receive-offload
+}
+
+chk_tso_flag() {
+	__chk_flag "$1" $2 $3 tcp-segmentation-offload
+}
+
+chk_gro() {
+	local msg="$1"
+	local expected=$2
+
+	ip netns exec $BASE$SRC ping -qc 1 $BM_NET_V4$DST >/dev/null
+	NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat -n
+
+	printf "%-60s" "$msg"
+	ip netns exec $BASE$DST ./udpgso_bench_rx -C 1000 -R 10 &
+	local spid=$!
+	sleep 0.1
+
+	ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 13000 -S 1300 -M 1 -D $BM_NET_V4$DST
+	local retc=$?
+	wait $spid
+	local rets=$?
+	if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
+		echo " fail client exit code $retc, server $rets"
+		ret=1
+		return
+	fi
+
+	local pkts=`NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat IpInReceives | \
+		    awk '{print $2}' | tail -n 1`
+	if [ "$pkts" = "$expected" ]; then
+		echo " ok "
+	else
+		echo " fail - got $pkts packets, expected $expected "
+		ret=1
+	fi
+}
+
+if [ ! -f ../bpf/xdp_dummy.o ]; then
+	echo "Missing xdp_dummy helper. Build bpf selftest first"
+	exit -1
+fi
+
+create_ns
+chk_gro_flag "default - gro flag" $SRC off
+chk_gro_flag "        - peer gro flag" $DST off
+chk_tso_flag "        - tso flag" $SRC on
+chk_tso_flag "        - peer tso flag" $DST on
+chk_gro "        - aggregation" 1
+ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
+chk_gro "        - aggregation with TSO off" 10
+cleanup
+
+create_ns
+ip netns exec $NS_DST ethtool -K veth$DST gro on
+chk_gro_flag "with gro on - gro flag" $DST on
+chk_gro_flag "        - peer gro flag" $SRC off
+chk_tso_flag "        - tso flag" $SRC on
+chk_tso_flag "        - peer tso flag" $DST on
+ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
+ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+chk_gro "        - aggregation with TSO off" 1
+cleanup
+
+create_ns
+ip -n $NS_DST link set dev veth$DST down
+ip netns exec $NS_DST ethtool -K veth$DST gro on
+chk_gro_flag "with gro enabled on link down - gro flag" $DST on
+chk_gro_flag "        - peer gro flag" $SRC off
+chk_tso_flag "        - tso flag" $SRC on
+chk_tso_flag "        - peer tso flag" $DST on
+ip -n $NS_DST link set dev veth$DST up
+ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
+ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+chk_gro "        - aggregation with TSO off" 1
+cleanup
+
+create_ns
+ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null
+chk_gro_flag "with xdp attached - gro flag" $DST on
+chk_gro_flag "        - peer gro flag" $SRC off
+chk_tso_flag "        - tso flag" $SRC off
+chk_tso_flag "        - peer tso flag" $DST on
+ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+chk_gro "        - aggregation" 1
+
+
+ip -n $NS_DST link set dev veth$DST down
+ip -n $NS_SRC link set dev veth$SRC down
+chk_gro_flag "        - after dev off, flag" $DST on
+chk_gro_flag "        - peer flag" $SRC off
+
+ip netns exec $NS_DST ethtool -K veth$DST gro on
+ip -n $NS_DST link set dev veth$DST xdp off
+chk_gro_flag "        - after gro on xdp off, gro flag" $DST on
+chk_gro_flag "        - peer gro flag" $SRC off
+chk_tso_flag "        - tso flag" $SRC on
+chk_tso_flag "        - peer tso flag" $DST on
+ip -n $NS_DST link set dev veth$DST up
+ip -n $NS_SRC link set dev veth$SRC up
+chk_gro "        - aggregation" 1
+
+ip netns exec $NS_DST ethtool -K veth$DST gro off
+ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
+chk_gro "aggregation again with default and TSO off" 10
+
+exit $ret