Skip to content
Snippets Groups Projects
ip6_output.c 51.7 KiB
Newer Older
// SPDX-License-Identifier: GPL-2.0-or-later
Linus Torvalds's avatar
Linus Torvalds committed
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
Linus Torvalds's avatar
Linus Torvalds committed
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
Linus Torvalds's avatar
Linus Torvalds committed
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	airthmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
Linus Torvalds's avatar
Linus Torvalds committed
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
Herbert Xu's avatar
Herbert Xu committed
#include <linux/kernel.h>
Linus Torvalds's avatar
Linus Torvalds committed
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
Linus Torvalds's avatar
Linus Torvalds committed

#include <linux/bpf-cgroup.h>
Linus Torvalds's avatar
Linus Torvalds committed
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
Linus Torvalds's avatar
Linus Torvalds committed

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
{
Eric Dumazet's avatar
Eric Dumazet committed
	struct dst_entry *dst = skb_dst(skb);
Linus Torvalds's avatar
Linus Torvalds committed
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	int delta = hh_len - skb_headroom(skb);
	const struct in6_addr *nexthop;
	struct neighbour *neigh;
Linus Torvalds's avatar
Linus Torvalds committed

	/* Be paranoid, rather than too clever. */
	if (unlikely(delta > 0) && dev->header_ops) {
		/* pskb_expand_head() might crash, if skb is shared */
		if (skb_shared(skb)) {
			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

			if (likely(nskb)) {
				if (skb->sk)
					skb_set_owner_w(nskb, skb->sk);
				consume_skb(skb);
			} else {
				kfree_skb(skb);
			}
			skb = nskb;
		}
		if (skb &&
		    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
			kfree_skb(skb);
			skb = NULL;
		}
		if (!skb) {
			IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
Eric Dumazet's avatar
Eric Dumazet committed
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
Linus Torvalds's avatar
Linus Torvalds committed

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
Linus Torvalds's avatar
Linus Torvalds committed
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
Linus Torvalds's avatar
Linus Torvalds committed

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
Linus Torvalds's avatar
Linus Torvalds committed
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();
	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
		return ip6_finish_output2(net, sk, skb);
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
		return __ip6_finish_output(net, sk, skb);
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb(skb);
		return ret;
	}
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
Eric Dumazet's avatar
Eric Dumazet committed
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

Linus Torvalds's avatar
Linus Torvalds committed
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
Linus Torvalds's avatar
Linus Torvalds committed
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
Linus Torvalds's avatar
Linus Torvalds committed
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
Eric Dumazet's avatar
Eric Dumazet committed
	struct dst_entry *dst = skb_dst(skb);
Linus Torvalds's avatar
Linus Torvalds committed
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
Linus Torvalds's avatar
Linus Torvalds committed
	int seg_len = skb->len;
	int hlimit = -1;
Linus Torvalds's avatar
Linus Torvalds committed
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
Linus Torvalds's avatar
Linus Torvalds committed
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

Linus Torvalds's avatar
Linus Torvalds committed
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
Linus Torvalds's avatar
Linus Torvalds committed
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
Linus Torvalds's avatar
Linus Torvalds committed

	/*
	 *	Fill in the IPv6 header
	 */
Linus Torvalds's avatar
Linus Torvalds committed
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);
Linus Torvalds's avatar
Linus Torvalds committed

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));
Linus Torvalds's avatar
Linus Torvalds committed
	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;
Linus Torvalds's avatar
Linus Torvalds committed

	skb->priority = priority;
Linus Torvalds's avatar
Linus Torvalds committed
	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
Eric Dumazet's avatar
Eric Dumazet committed
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
Linus Torvalds's avatar
Linus Torvalds committed
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

Eric Dumazet's avatar
Eric Dumazet committed
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
Linus Torvalds's avatar
Linus Torvalds committed
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

Linus Torvalds's avatar
Linus Torvalds committed
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
Linus Torvalds's avatar
Linus Torvalds committed
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
Linus Torvalds's avatar
Linus Torvalds committed
int ip6_forward(struct sk_buff *skb)
{
Eric Dumazet's avatar
Eric Dumazet committed
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
Linus Torvalds's avatar
Linus Torvalds committed
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (net->ipv6.devconf_all->forwarding == 0)
Linus Torvalds's avatar
Linus Torvalds committed
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (!net->ipv6.devconf_all->disable_policy &&
	    !idev->cnf.disable_policy &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
Linus Torvalds's avatar
Linus Torvalds committed
		goto drop;
	}

Linus Torvalds's avatar
Linus Torvalds committed

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
Linus Torvalds's avatar
Linus Torvalds committed
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
Linus Torvalds's avatar
Linus Torvalds committed

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
Linus Torvalds's avatar
Linus Torvalds committed
	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
Linus Torvalds's avatar
Linus Torvalds committed
		goto drop;
	}
Eric Dumazet's avatar
Eric Dumazet committed
	dst = skb_dst(skb);
Linus Torvalds's avatar
Linus Torvalds committed

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
Linus Torvalds's avatar
Linus Torvalds committed
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
Linus Torvalds's avatar
Linus Torvalds committed
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
Linus Torvalds's avatar
Linus Torvalds committed
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
Linus Torvalds's avatar
Linus Torvalds committed
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
Linus Torvalds's avatar
Linus Torvalds committed
		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

Linus Torvalds's avatar
Linus Torvalds committed
		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
Linus Torvalds's avatar
Linus Torvalds committed
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
Linus Torvalds's avatar
Linus Torvalds committed
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
Linus Torvalds's avatar
Linus Torvalds committed
		goto drop;
	}

Linus Torvalds's avatar
Linus Torvalds committed

	/* Mangling hops number delayed to point after skb COW */
Linus Torvalds's avatar
Linus Torvalds committed
	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);
Linus Torvalds's avatar
Linus Torvalds committed

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
Linus Torvalds's avatar
Linus Torvalds committed
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
Eric Dumazet's avatar
Eric Dumazet committed
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
Linus Torvalds's avatar
Linus Torvalds committed
	to->dev = from->dev;
	to->mark = from->mark;
Linus Torvalds's avatar
Linus Torvalds committed

	skb_copy_hash(to, from);

Linus Torvalds's avatar
Linus Torvalds committed
#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
Linus Torvalds's avatar
Linus Torvalds committed
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
Linus Torvalds's avatar
Linus Torvalds committed
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
Linus Torvalds's avatar
Linus Torvalds committed
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);
Linus Torvalds's avatar
Linus Torvalds committed

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;
	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);
Linus Torvalds's avatar
Linus Torvalds committed

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;
Linus Torvalds's avatar
Linus Torvalds committed

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
Linus Torvalds's avatar
Linus Torvalds committed
			goto slow_path;

		skb_walk_frags(skb, frag) {
Linus Torvalds's avatar
Linus Torvalds committed
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;
Linus Torvalds's avatar
Linus Torvalds committed

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
Linus Torvalds's avatar
Linus Torvalds committed
		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);
			err = output(net, sk, skb);
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);
			if (err || !iter.frag)
Linus Torvalds's avatar
Linus Torvalds committed
				break;

			skb = ip6_fraglist_next(&iter);
		kfree(iter.tmp_hdr);
Linus Torvalds's avatar
Linus Torvalds committed

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
Linus Torvalds's avatar
Linus Torvalds committed
			return 0;
		}

		kfree_skb_list(iter.frag);
Linus Torvalds's avatar
Linus Torvalds committed

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
Linus Torvalds's avatar
Linus Torvalds committed
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
Linus Torvalds's avatar
Linus Torvalds committed
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);
Linus Torvalds's avatar
Linus Torvalds committed

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
Linus Torvalds's avatar
Linus Torvalds committed
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
Linus Torvalds's avatar
Linus Torvalds committed
		if (err)
			goto fail;
Eric Dumazet's avatar
Eric Dumazet committed
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
Linus Torvalds's avatar
Linus Torvalds committed
	}
Eric Dumazet's avatar
Eric Dumazet committed
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
Linus Torvalds's avatar
Linus Torvalds committed
	return err;