patch-2.1.15 linux/net/ipv4/ip_output.c
Next file: linux/net/ipv4/ip_sockglue.c
Previous file: linux/net/ipv4/ip_options.c
Back to the patch index
Back to the overall index
- Lines: 1562
- Date: Thu Dec 12 16:54:24 1996
- Orig file: v2.1.14/linux/net/ipv4/ip_output.c
- Orig date: Fri Nov 22 18:28:23 1996
diff -u --recursive --new-file v2.1.14/linux/net/ipv4/ip_output.c linux/net/ipv4/ip_output.c
@@ -26,6 +26,7 @@
* Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
* (in case if packet not accepted by
* output firewall rules)
+ * Alexey Kuznetsov: use new route cache
*/
#include <asm/uaccess.h>
@@ -65,225 +66,134 @@
#include <linux/mroute.h>
#include <net/netlink.h>
-/*
- * Loop a packet back to the sender.
- */
-
-static void ip_loopback(struct device *old_dev, struct sk_buff *skb)
+static void __inline__ ip_ll_header_reserve(struct sk_buff *skb)
{
- struct device *dev=&loopback_dev;
- int len=ntohs(skb->ip_hdr->tot_len);
- struct sk_buff *newskb=dev_alloc_skb(len+dev->hard_header_len+15);
-
- if(newskb==NULL)
- return;
-
- newskb->link3=NULL;
- newskb->sk=NULL;
- newskb->dev=dev;
- newskb->saddr=skb->saddr;
- newskb->daddr=skb->daddr;
- newskb->raddr=skb->raddr;
- newskb->free=1;
- newskb->lock=0;
- newskb->users=0;
- newskb->pkt_type=skb->pkt_type;
-
- /*
- * Put a MAC header on the packet
- */
- ip_send(NULL,newskb, skb->ip_hdr->daddr, len, dev, skb->ip_hdr->saddr);
- /*
- * Add the rest of the data space.
- */
- newskb->ip_hdr=(struct iphdr *)skb_put(newskb, len);
- memcpy(newskb->proto_priv, skb->proto_priv, sizeof(skb->proto_priv));
-
- /*
- * Copy the data
- */
- memcpy(newskb->ip_hdr,skb->ip_hdr,len);
-
- /* Recurse. The device check against IFF_LOOPBACK will stop infinite recursion */
-
- /*printk("Loopback output queued [%lX to %lX].\n", newskb->ip_hdr->saddr,newskb->ip_hdr->daddr);*/
- ip_queue_xmit(NULL, dev, newskb, 2);
+ struct rtable *rt = (struct rtable*)skb->dst;
+ skb_reserve(skb, (rt->u.dst.dev->hard_header_len+15)&~15);
+ ip_ll_header(skb);
}
+int ip_id_count = 0;
-/*
- * Take an skb, and fill in the MAC header.
- */
-
-int ip_send(struct rtable * rt, struct sk_buff *skb, __u32 daddr, int len, struct device *dev, __u32 saddr)
+int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr,
+ struct ip_options *opt)
{
- int mac = 0;
+ struct rtable *rt;
+ u32 final_daddr = daddr;
+ struct iphdr *iph;
+ int err;
+
+ if (opt && opt->srr)
+ daddr = opt->faddr;
- skb->dev = dev;
- skb->arp = 1;
- skb->protocol = htons(ETH_P_IP);
- if (dev->hard_header)
+ err = ip_route_output(&rt, daddr, saddr, RT_TOS(sk->ip_tos) |
+ (sk->localroute||0), NULL);
+ if (err)
{
- /*
- * Build a hardware header. Source address is our mac, destination unknown
- * (rebuild header will sort this out)
- */
- skb_reserve(skb,(dev->hard_header_len+15)&~15); /* 16 byte aligned IP headers are good */
- if (rt && dev == rt->rt_dev && rt->rt_hh)
- {
- memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
- if (rt->rt_hh->hh_uptodate)
- return dev->hard_header_len;
-#if RT_CACHE_DEBUG >= 2
- printk("ip_send: hh miss %08x via %08x\n", daddr, rt->rt_gateway);
-#endif
- skb->arp = 0;
- skb->raddr = daddr;
- return dev->hard_header_len;
- }
- mac = dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, len);
- if (mac < 0)
- {
- mac = -mac;
- skb->arp = 0;
- skb->raddr = daddr; /* next routing address */
- }
+ ip_statistics.IpOutNoRoutes++;
+ return err;
}
- return mac;
-}
-static int ip_send_room(struct rtable * rt, struct sk_buff *skb, __u32 daddr, int len, struct device *dev, __u32 saddr)
-{
- int mac = 0;
+ if (opt && opt->is_strictroute && rt->rt_flags&RTF_GATEWAY) {
+ ip_rt_put(rt);
+ ip_statistics.IpOutNoRoutes++;
+ return -ENETUNREACH;
+ }
- skb->dev = dev;
- skb->arp = 1;
- skb->protocol = htons(ETH_P_IP);
- skb_reserve(skb,MAX_HEADER);
- if (dev->hard_header)
- {
- if (rt && dev == rt->rt_dev && rt->rt_hh)
- {
- memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
- if (rt->rt_hh->hh_uptodate)
- return dev->hard_header_len;
-#if RT_CACHE_DEBUG >= 2
- printk("ip_send_room: hh miss %08x via %08x\n", daddr, rt->rt_gateway);
-#endif
- skb->arp = 0;
- skb->raddr = daddr;
- return dev->hard_header_len;
- }
- mac = dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, len);
- if (mac < 0)
- {
- mac = -mac;
- skb->arp = 0;
- skb->raddr = daddr; /* next routing address */
- }
+ skb->dst = dst_clone(&rt->u.dst);
+
+ skb->dev = rt->u.dst.dev;
+ skb->arp = 0;
+
+ ip_ll_header_reserve(skb);
+
+ /*
+ * Now build the IP header.
+ */
+
+ /*
+ * Build the IP addresses
+ */
+
+ if (opt)
+ iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr) + opt->optlen);
+ else
+ iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr));
+
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->tos = sk->ip_tos;
+ iph->frag_off = 0;
+ if (sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
+ (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
+ rt->rt_flags&RTF_NOPMTUDISC))
+ iph->frag_off |= htons(IP_DF);
+ iph->ttl = sk->ip_ttl;
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = sk->protocol;
+ skb->nh.iph = iph;
+ skb->h.raw = (unsigned char*)(iph+1);
+
+ if (opt && opt->optlen)
+ {
+ iph->ihl += opt->optlen>>2;
+ skb->h.raw += opt->optlen;
+ ip_options_build(skb, opt, final_daddr,
+ rt->u.dst.dev->pa_addr, 0);
}
- return mac;
+
+ ip_rt_put(rt);
+ return 0;
}
-int ip_id_count = 0;
-
/*
* This routine builds the appropriate hardware/IP headers for
- * the routine. It assumes that if *dev != NULL then the
- * protocol knows what it's doing, otherwise it uses the
- * routing/ARP tables to select a device struct.
+ * the routine.
*/
-int ip_build_header(struct sk_buff *skb, __u32 saddr, __u32 daddr,
- struct device **dev, int type, struct options *opt,
- int len, int tos, int ttl, struct rtable ** rp)
+int ip_build_header(struct sk_buff *skb, struct sock *sk)
{
struct rtable *rt;
- __u32 raddr;
- int tmp;
+ struct ip_options *opt = sk->opt;
+ u32 daddr = sk->daddr;
+ u32 final_daddr = daddr;
struct iphdr *iph;
- __u32 final_daddr = daddr;
-
+ int err;
if (opt && opt->srr)
daddr = opt->faddr;
- /*
- * See if we need to look up the device.
- */
-
-#ifdef CONFIG_IP_MULTICAST
- if(MULTICAST(daddr) && *dev==NULL && skb->sk && *skb->sk->ip_mc_name)
- *dev=dev_get(skb->sk->ip_mc_name);
-#endif
- if (rp)
- {
- rt = ip_check_route(rp, daddr, skb->localroute);
- /*
- * If rp != NULL rt_put following below should not
- * release route, so that...
- */
- if (rt)
- atomic_inc(&rt->rt_refcnt);
- }
- else
- rt = ip_rt_route(daddr, skb->localroute);
-
+ rt = (struct rtable*)sk->dst_cache;
- if (*dev == NULL)
- {
- if (rt == NULL)
- {
- ip_statistics.IpOutNoRoutes++;
- return(-ENETUNREACH);
- }
-
- *dev = rt->rt_dev;
+ if (!rt || rt->u.dst.obsolete) {
+ ip_rt_put(rt);
+ err = ip_route_output(&rt, daddr, sk->saddr, RT_TOS(sk->ip_tos) |
+ (sk->localroute||0), NULL);
+ if (err)
+ return err;
+ sk->dst_cache = &rt->u.dst;
}
- if ((LOOPBACK(saddr) && !LOOPBACK(daddr)) || !saddr)
- saddr = rt ? rt->rt_src : (*dev)->pa_addr;
-
- raddr = rt ? rt->rt_gateway : daddr;
-
- if (opt && opt->is_strictroute && rt && (rt->rt_flags & RTF_GATEWAY))
- {
+ if (opt && opt->is_strictroute && rt->rt_flags&RTF_GATEWAY) {
+ sk->dst_cache = NULL;
ip_rt_put(rt);
ip_statistics.IpOutNoRoutes++;
return -ENETUNREACH;
}
- /*
- * Now build the MAC header.
- */
-
- if (type==IPPROTO_TCP)
- tmp = ip_send_room(rt, skb, raddr, len, *dev, saddr);
- else
- tmp = ip_send(rt, skb, raddr, len, *dev, saddr);
-
- ip_rt_put(rt);
-
- /*
- * Book keeping
- */
+ skb->dst = dst_clone(sk->dst_cache);
- skb->dev = *dev;
- skb->saddr = saddr;
+ skb->dev = rt->u.dst.dev;
+ skb->arp = 0;
+ skb_reserve(skb, MAX_HEADER);
+ skb->mac.raw = skb->data;
/*
* Now build the IP header.
*/
/*
- * If we are using IPPROTO_RAW, then we don't need an IP header, since
- * one is being supplied to us by the user
- */
-
- if(type == IPPROTO_RAW)
- return (tmp);
-
- /*
* Build the IP addresses
*/
@@ -294,21 +204,118 @@
iph->version = 4;
iph->ihl = 5;
- iph->tos = tos;
+ iph->tos = sk->ip_tos;
iph->frag_off = 0;
- iph->ttl = ttl;
- iph->daddr = daddr;
- iph->saddr = saddr;
- iph->protocol = type;
- skb->ip_hdr = iph;
+ if (sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
+ (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
+ rt->rt_flags&RTF_NOPMTUDISC))
+ iph->frag_off |= htons(IP_DF);
+ iph->ttl = sk->ip_ttl;
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = sk->protocol;
+ skb->nh.iph = iph;
+ skb->h.raw = (unsigned char*)(iph+1);
if (!opt || !opt->optlen)
- return sizeof(struct iphdr) + tmp;
+ return 0;
iph->ihl += opt->optlen>>2;
- ip_options_build(skb, opt, final_daddr, (*dev)->pa_addr, 0);
- return iph->ihl*4 + tmp;
+ skb->h.raw += opt->optlen;
+ ip_options_build(skb, opt, final_daddr, rt->u.dst.dev->pa_addr, 0);
+
+ return 0;
}
+int ip_mc_output(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct device *dev = rt->u.dst.dev;
+
+ /*
+ * If the indicated interface is up and running, send the packet.
+ */
+
+ ip_statistics.IpOutRequests++;
+#ifdef CONFIG_IP_ACCT
+ ip_fw_chk(skb->nh.iph, skb->dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT);
+#endif
+
+ if (rt->rt_flags & RTCF_NAT)
+ ip_do_nat(skb);
+
+ /*
+ * Multicasts are looped back for other local users
+ */
+
+ if (rt->rt_flags&RTF_MULTICAST && !(dev->flags&IFF_LOOPBACK)) {
+ if (sk==NULL || sk->ip_mc_loop)
+ dev_loopback_xmit(skb);
+
+ /* Multicasts with ttl 0 must not go beyond the host */
+
+ if (skb->nh.iph->ttl == 0) {
+ kfree_skb(skb, FREE_WRITE);
+ return 0;
+ }
+ }
+
+ if ((rt->rt_flags&(RTF_LOCAL|RTF_BROADCAST)) == (RTF_LOCAL|RTF_BROADCAST) &&
+ !(dev->flags&IFF_LOOPBACK))
+ dev_loopback_xmit(skb);
+
+ if (dev->flags & IFF_UP) {
+ dev_queue_xmit(skb);
+ return 0;
+ }
+ ip_statistics.IpOutDiscards++;
+
+ kfree_skb(skb, FREE_WRITE);
+ return -ENETDOWN;
+}
+
+int ip_output(struct sk_buff *skb)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct device *dev = rt->u.dst.dev;
+
+ /*
+ * If the indicated interface is up and running, send the packet.
+ */
+
+ ip_statistics.IpOutRequests++;
+
+#ifdef CONFIG_IP_ACCT
+ ip_fw_chk(skb->nh.iph, skb->dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT);
+#endif
+
+ if (rt->rt_flags&RTCF_NAT)
+ ip_do_nat(skb);
+
+ if (dev->flags & IFF_UP) {
+ dev_queue_xmit(skb);
+ return 0;
+ }
+ ip_statistics.IpOutDiscards++;
+
+ kfree_skb(skb, FREE_WRITE);
+ return -ENETDOWN;
+}
+
+#ifdef CONFIG_IP_ACCT
+int ip_acct_output(struct sk_buff *skb)
+{
+ /*
+ * Count mapping we shortcut
+ */
+
+ ip_fw_chk(skb->nh.iph, skb->dev, NULL, ip_acct_chain, 0, IP_FW_MODE_ACCT_OUT);
+
+ dev_queue_xmit(skb);
+
+ return 0;
+}
+#endif
/*
* Generate a checksum for an outgoing IP datagram.
@@ -331,54 +338,48 @@
* and compute the checksum
*/
-void ip_queue_xmit(struct sock *sk, struct device *dev,
- struct sk_buff *skb, int free)
+void ip_queue_xmit(struct sk_buff *skb)
{
+ struct sock *sk = skb->sk;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct device *dev = rt->u.dst.dev;
unsigned int tot_len;
- struct iphdr *iph;
-
- IS_SKB(skb);
+ struct iphdr *iph = skb->nh.iph;
/*
- * Do some book-keeping in the packet for later
- */
-
- skb->sk = sk;
- skb->dev = dev;
- skb->when = jiffies;
-
- /*
- * Find the IP header and set the length. This is bad
- * but once we get the skb data handling code in the
- * hardware will push its header sensibly and we will
- * set skb->ip_hdr to avoid this mess and the fixed
- * header length problem
+ * Discard the surplus MAC header
*/
+
+ skb_pull(skb, skb->nh.raw - skb->data);
+ tot_len = skb->len;
- iph = skb->ip_hdr;
- tot_len = skb->len - (((unsigned char *)iph) - skb->data);
iph->tot_len = htons(tot_len);
+ iph->id = htons(ip_id_count++);
- switch (free) {
- /* No reassigning numbers to fragments... */
- case 2:
- free = 1;
- break;
- default:
- free = 1;
- iph->id = htons(ip_id_count++);
+#ifdef CONFIG_FIREWALL
+ if (call_out_firewall(PF_INET, dev, iph, NULL) < FW_ACCEPT) {
+ kfree_skb(skb, FREE_WRITE);
+ return;
}
+#endif
- skb->free = free;
+ if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
+ struct sk_buff *skb2;
+ /* ANK: It is almost impossible, but
+ * if you loaded module device with hh_len > MAX_HEADER,
+ * and if a route changed to this device,
+ * and if (uh...) TCP had segments queued on this route...
+ */
+ skb2 = skb_realloc_headroom(skb, (dev->hard_header_len+15)&~15);
+ kfree_skb(skb, FREE_WRITE);
+ if (skb2 == NULL)
+ return;
+ skb = skb2;
+ iph = skb->nh.iph;
+ }
- /* Sanity check */
- if (dev == NULL)
- goto no_device;
+ ip_ll_header(skb);
-#ifdef CONFIG_FIREWALL
- if (call_out_firewall(PF_INET, skb->dev, iph, NULL) < FW_ACCEPT)
- goto out;
-#endif
/*
* Do we need to fragment. Again this is inefficient.
@@ -386,10 +387,8 @@
* bits of it.
*/
- if (tot_len > dev->mtu)
- {
+ if (tot_len > rt->u.dst.pmtu)
goto fragment;
- }
/*
* Add an IP checksum
@@ -397,101 +396,27 @@
ip_send_check(iph);
- /*
- * More debugging. You cannot queue a packet already on a list
- * Spot this and moan loudly.
- */
- if (skb->next != NULL)
- {
- NETDEBUG(printk("ip_queue_xmit: next != NULL\n"));
- skb_unlink(skb);
- }
-
- /*
- * If the indicated interface is up and running, send the packet.
- */
-
- ip_statistics.IpOutRequests++;
-#ifdef CONFIG_IP_ACCT
- ip_fw_chk(iph,dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT);
-#endif
-
-#ifdef CONFIG_IP_MULTICAST
-
- /*
- * Multicasts are looped back for other local users
- */
-
- if (MULTICAST(iph->daddr) && !(dev->flags&IFF_LOOPBACK))
- {
- if(sk==NULL || sk->ip_mc_loop)
- {
- if(iph->daddr==IGMP_ALL_HOSTS || (dev->flags&IFF_ALLMULTI))
- {
- ip_loopback(dev,skb);
- }
- else
- {
- struct ip_mc_list *imc=dev->ip_mc_list;
- while(imc!=NULL)
- {
- if(imc->multiaddr==iph->daddr)
- {
- ip_loopback(dev,skb);
- break;
- }
- imc=imc->next;
- }
- }
- }
- /* Multicasts with ttl 0 must not go beyond the host */
-
- if (iph->ttl==0)
- goto out;
- }
-#endif
- if ((dev->flags & IFF_BROADCAST) && !(dev->flags & IFF_LOOPBACK)
- && (iph->daddr==dev->pa_brdaddr || iph->daddr==0xFFFFFFFF))
- ip_loopback(dev,skb);
-
- if (dev->flags & IFF_UP)
- {
- /*
- * If we have an owner use its priority setting,
- * otherwise use NORMAL
- */
- int priority = SOPRI_NORMAL;
- if (sk)
- priority = sk->priority;
-
- dev_queue_xmit(skb, dev, priority);
- return;
- }
- if(sk)
- sk->err = ENETDOWN;
- ip_statistics.IpOutDiscards++;
-out:
- if (free)
- kfree_skb(skb, FREE_WRITE);
+ if (sk)
+ skb->priority = sk->priority;
+ skb->dst->output(skb);
return;
-no_device:
- NETDEBUG(printk("IP: ip_queue_xmit dev = NULL\n"));
- goto out;
-
fragment:
if ((iph->frag_off & htons(IP_DF)))
{
printk(KERN_DEBUG "sending pkt_too_big to self\n");
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(dev->mtu), dev);
- goto out;
+ htonl(dev->mtu));
+
+ kfree_skb(skb, FREE_WRITE);
+ return;
}
- ip_fragment(sk,skb,dev,0);
- goto out;
+
+ ip_fragment(skb, 1, skb->dst->output);
}
+
/*
* Build and send a packet, with as little as one copy
*
@@ -514,167 +439,87 @@
int ip_build_xmit(struct sock *sk,
int getfrag (const void *,
- __u32,
char *,
unsigned int,
unsigned int),
- const void *frag,
- unsigned short int length,
- __u32 daddr,
- __u32 user_saddr,
- struct options * opt,
- int flags,
- int type,
- int noblock)
+ const void *frag,
+ unsigned short length,
+ struct ipcm_cookie *ipc,
+ struct rtable *rt,
+ int flags)
{
- struct rtable *rt;
unsigned int fraglen, maxfraglen, fragheaderlen;
+ int err;
int offset, mf;
- __u32 saddr;
unsigned short id;
struct iphdr *iph;
- __u32 raddr;
- struct device *dev = NULL;
- struct hh_cache * hh=NULL;
+ int hh_len = rt->u.dst.dev->hard_header_len;
int nfrags=0;
- __u32 true_daddr = daddr;
- int err;
-
- if (opt && opt->srr && !sk->ip_hdrincl)
- daddr = opt->faddr;
+ struct ip_options *opt = ipc->opt;
+ struct device *dev = rt->u.dst.dev;
+ int df = htons(IP_DF);
- ip_statistics.IpOutRequests++;
-
-#ifdef CONFIG_IP_MULTICAST
- if(MULTICAST(daddr) && *sk->ip_mc_name)
- {
- dev=dev_get(sk->ip_mc_name);
- if(!dev)
- return -ENODEV;
- rt=NULL;
- if (sk->saddr && (!LOOPBACK(sk->saddr) || LOOPBACK(daddr)))
- saddr = sk->saddr;
- else
- saddr = dev->pa_addr;
- }
- else
- {
-#endif
- rt = ip_check_route(&sk->ip_route_cache, daddr,
- sk->localroute || (flags&MSG_DONTROUTE) ||
- (opt && opt->is_strictroute));
- if (rt == NULL)
- {
- ip_statistics.IpOutNoRoutes++;
- return(-ENETUNREACH);
- }
- saddr = rt->rt_src;
- hh = rt->rt_hh;
-
- if (sk->saddr && (!LOOPBACK(sk->saddr) || LOOPBACK(daddr)))
- saddr = sk->saddr;
-
- dev=rt->rt_dev;
-#ifdef CONFIG_IP_MULTICAST
- }
- if (rt && !dev)
- dev = rt->rt_dev;
-#endif
- if (user_saddr)
- saddr = user_saddr;
+ if (sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
+ (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
+ rt->rt_flags&RTF_NOPMTUDISC))
+ df = 0;
- raddr = rt ? rt->rt_gateway : daddr;
- /*
- * Now compute the buffer space we require
- */
/*
- * Try the simple case first. This leaves broadcast, multicast, fragmented frames, and by
+ * Try the simple case first. This leaves fragmented frames, and by
* choice RAW frames within 20 bytes of maximum size(rare) to the long path
*/
- if (!sk->ip_hdrincl) {
+ if (!sk->ip_hdrincl)
length += sizeof(struct iphdr);
- if(opt) length += opt->optlen;
- }
- if(length <= dev->mtu && !MULTICAST(daddr) && daddr!=0xFFFFFFFF && daddr!=dev->pa_brdaddr)
- {
+ if (length <= rt->u.dst.pmtu && opt == NULL) {
int error;
- struct sk_buff *skb=sock_alloc_send_skb(sk, length+15+dev->hard_header_len,0, noblock, &error);
- if(skb==NULL)
- {
+ struct sk_buff *skb=sock_alloc_send_skb(sk, length+15+hh_len,
+ 0, flags&MSG_DONTWAIT, &error);
+ if(skb==NULL) {
ip_statistics.IpOutDiscards++;
return error;
}
- skb->dev=dev;
- skb->protocol = htons(ETH_P_IP);
- skb->free=1;
+
skb->when=jiffies;
- skb->sk=sk;
- skb->arp=0;
- skb->saddr=saddr;
- skb->raddr = raddr;
- skb_reserve(skb,(dev->hard_header_len+15)&~15);
- if (hh)
- {
- skb->arp=1;
- memcpy(skb_push(skb,dev->hard_header_len),hh->hh_data,dev->hard_header_len);
- if (!hh->hh_uptodate)
- {
- skb->arp = 0;
-#if RT_CACHE_DEBUG >= 2
- printk("ip_build_xmit: hh miss %08x via %08x\n", rt->rt_dst, rt->rt_gateway);
-#endif
- }
- }
- else if(dev->hard_header)
- {
- if(dev->hard_header(skb,dev,ETH_P_IP,NULL,NULL,0)>0)
- skb->arp=1;
- }
- else
- skb->arp=1;
- skb->ip_hdr=iph=(struct iphdr *)skb_put(skb,length);
+ skb->priority = sk->priority;
+ skb->dst = dst_clone(&rt->u.dst);
+
+ ip_ll_header_reserve(skb);
+
+ skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
+
dev_lock_list();
- if(!sk->ip_hdrincl)
- {
+
+ if(!sk->ip_hdrincl) {
iph->version=4;
iph->ihl=5;
iph->tos=sk->ip_tos;
iph->tot_len = htons(length);
iph->id=htons(ip_id_count++);
- iph->frag_off = 0;
- iph->ttl=sk->ip_ttl;
- iph->protocol=type;
- iph->saddr=saddr;
- iph->daddr=daddr;
- if (opt)
- {
- iph->ihl += opt->optlen>>2;
- ip_options_build(skb, opt,
- true_daddr, dev->pa_addr, 0);
- }
+ iph->frag_off = df;
+ iph->ttl=sk->ip_mc_ttl;
+ if (!(rt->rt_flags&RTF_MULTICAST))
+ iph->ttl=sk->ip_ttl;
+ iph->protocol=sk->protocol;
+ iph->saddr=rt->rt_src;
+ iph->daddr=rt->rt_dst;
iph->check=0;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
- err = getfrag(frag,saddr,((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
+ err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
}
else
- err = getfrag(frag, saddr, (void *)iph, 0, length);
-
+ err = getfrag(frag, (void *)iph, 0, length);
dev_unlock_list();
-
+
if (err)
- {
err = -EFAULT;
- }
#ifdef CONFIG_FIREWALL
- if(!err && call_out_firewall(PF_INET, skb->dev, iph, NULL)< FW_ACCEPT)
- {
- err = -EPERM;
- }
+ if(!err && call_out_firewall(PF_INET, skb->dev, iph, NULL) < FW_ACCEPT)
+ err = -EPERM;
#endif
if (err)
@@ -683,39 +528,26 @@
return err;
}
-#ifdef CONFIG_IP_ACCT
- ip_fw_chk(iph,dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT);
-#endif
- if(dev->flags&IFF_UP)
- dev_queue_xmit(skb,dev,sk->priority);
- else
- {
- ip_statistics.IpOutDiscards++;
- kfree_skb(skb, FREE_WRITE);
- }
- return 0;
+ return rt->u.dst.output(skb);
}
+
if (!sk->ip_hdrincl)
length -= sizeof(struct iphdr);
-
- if(opt)
- {
- length -= opt->optlen;
- fragheaderlen = dev->hard_header_len + sizeof(struct iphdr) + opt->optlen;
- maxfraglen = ((dev->mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
- }
- else
- {
- fragheaderlen = dev->hard_header_len;
+
+ if (opt) {
+ fragheaderlen = hh_len + sizeof(struct iphdr) + opt->optlen;
+ maxfraglen = ((rt->u.dst.pmtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
+ } else {
+ fragheaderlen = hh_len;
if(!sk->ip_hdrincl)
- fragheaderlen += 20;
+ fragheaderlen += sizeof(struct iphdr);
/*
- * Fragheaderlen is the size of 'overhead' on each buffer.
- * Now work out the size of the frames to send.
+ * Fragheaderlen is the size of 'overhead' on each buffer. Now work
+ * out the size of the frames to send.
*/
- maxfraglen = ((dev->mtu-20) & ~7) + fragheaderlen;
+ maxfraglen = ((rt->u.dst.pmtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
}
/*
@@ -730,8 +562,7 @@
fraglen = length - offset + fragheaderlen;
- if(length-offset==0)
- {
+ if (length-offset==0) {
fraglen = maxfraglen;
offset -= maxfraglen-fragheaderlen;
}
@@ -747,7 +578,7 @@
* Can't fragment raw packets
*/
- if (sk->ip_hdrincl && offset > 0)
+ if (offset > 0 && df)
return(-EMSGSIZE);
/*
@@ -766,8 +597,7 @@
* Being outputting the bytes.
*/
- do
- {
+ do {
struct sk_buff * skb;
int error;
char *data;
@@ -776,12 +606,11 @@
* Get the memory we require with some space left for alignment.
*/
- skb = sock_alloc_send_skb(sk, fraglen+15, 0, noblock, &error);
- if (skb == NULL)
- {
+ skb = sock_alloc_send_skb(sk, fraglen+15, 0, flags&MSG_DONTWAIT, &error);
+ if (skb == NULL) {
ip_statistics.IpOutDiscards++;
if(nfrags>1)
- ip_statistics.IpFragCreates++;
+ ip_statistics.IpFragCreates++;
dev_unlock_list();
return(error);
}
@@ -790,81 +619,44 @@
* Fill in the control structures
*/
- skb->dev = dev;
- skb->protocol = htons(ETH_P_IP);
skb->when = jiffies;
- skb->free = 1; /* dubious, this one */
- skb->sk = sk;
- skb->arp = 0;
- skb->saddr = saddr;
- skb->daddr = daddr;
- skb->raddr = raddr;
- skb_reserve(skb,(dev->hard_header_len+15)&~15);
- data = skb_put(skb, fraglen-dev->hard_header_len);
+ skb->priority = sk->priority;
+ skb->dst = dst_clone(&rt->u.dst);
+
+ ip_ll_header_reserve(skb);
- /*
- * Save us ARP and stuff. In the optimal case we do no route lookup (route cache ok)
- * no ARP lookup (arp cache ok) and output. The cache checks are still too slow but
- * this can be fixed later. For gateway routes we ought to have a rt->.. header cache
- * pointer to speed header cache builds for identical targets.
- */
-
- if (hh)
- {
- skb->arp=1;
- memcpy(skb_push(skb,dev->hard_header_len),hh->hh_data,dev->hard_header_len);
- if (!hh->hh_uptodate)
- {
- skb->arp = 0;
-#if RT_CACHE_DEBUG >= 2
- printk("ip_build_xmit: hh miss %08x via %08x\n", rt->rt_dst, rt->rt_gateway);
-#endif
- }
- }
- else if (dev->hard_header)
- {
- if(dev->hard_header(skb, dev, ETH_P_IP,
- NULL, NULL, 0)>0)
- skb->arp=1;
- }
- else
- skb->arp = 1;
-
/*
* Find where to start putting bytes.
*/
- skb->ip_hdr = iph = (struct iphdr *)data;
+ data = skb_put(skb, fraglen-hh_len);
+ skb->nh.iph = iph = (struct iphdr *)data;
/*
* Only write IP header onto non-raw packets
*/
- if(!sk->ip_hdrincl)
- {
-
+ if(!sk->ip_hdrincl) {
iph->version = 4;
- iph->ihl = 5; /* ugh */
+ iph->ihl = 5;
if (opt) {
iph->ihl += opt->optlen>>2;
ip_options_build(skb, opt,
- true_daddr, dev->pa_addr, offset);
+ ipc->addr, dev->pa_addr, offset);
}
iph->tos = sk->ip_tos;
iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
iph->id = id;
iph->frag_off = htons(offset>>3);
- iph->frag_off |= mf;
-#ifdef CONFIG_IP_MULTICAST
- if (MULTICAST(daddr))
+ iph->frag_off |= mf|df;
+ if (rt->rt_flags&RTF_MULTICAST)
iph->ttl = sk->ip_mc_ttl;
else
-#endif
iph->ttl = sk->ip_ttl;
- iph->protocol = type;
+ iph->protocol = sk->protocol;
iph->check = 0;
- iph->saddr = saddr;
- iph->daddr = daddr;
+ iph->saddr = rt->rt_src;
+ iph->daddr = rt->rt_dst;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
data += iph->ihl*4;
@@ -879,11 +671,9 @@
* User data callback
*/
- err = getfrag(frag, saddr, data, offset, fraglen-fragheaderlen);
+ err = getfrag(frag, data, offset, fraglen-fragheaderlen);
if (err)
- {
err = -EFAULT;
- }
/*
* Account for the fragment.
@@ -891,115 +681,271 @@
#ifdef CONFIG_FIREWALL
if(!err && !offset && call_out_firewall(PF_INET, skb->dev, iph, NULL) < FW_ACCEPT)
- {
- err = -EPERM;
- }
-#endif
+ err = -EPERM;
+#endif
if (err)
- {
+ {
kfree_skb(skb, FREE_WRITE);
dev_unlock_list();
return err;
- }
-
-#ifdef CONFIG_IP_ACCT
- if(!offset)
- ip_fw_chk(iph, dev, NULL, ip_acct_chain, 0, IP_FW_MODE_ACCT_OUT);
-#endif
+ }
offset -= (maxfraglen-fragheaderlen);
fraglen = maxfraglen;
-#ifdef CONFIG_IP_MULTICAST
+ nfrags++;
+
+ if (rt->u.dst.output(skb)) {
+ if (nfrags>1)
+ ip_statistics.IpFragCreates += nfrags;
+ dev_unlock_list();
+ return -ENETDOWN;
+ }
+ } while (offset >= 0);
+
+ if (nfrags>1)
+ ip_statistics.IpFragCreates += nfrags;
+
+ dev_unlock_list();
+ return 0;
+}
+
+/*
+ * This IP datagram is too large to be sent in one piece. Break it up into
+ * smaller pieces (each of size equal to the MAC header plus IP header plus
+ * a block of the data of the original IP data part) that will yet fit in a
+ * single device frame, and queue such a frame for sending.
+ *
+ * Assumption: packet was ready for transmission, link layer header
+ * is already in.
+ *
+ * Yes this is inefficient, feel free to submit a quicker one.
+ */
+
+void ip_fragment(struct sk_buff *skb, int local, int (*output)(struct sk_buff*))
+{
+ struct iphdr *iph;
+ unsigned char *raw;
+ unsigned char *ptr;
+ struct device *dev;
+ struct sk_buff *skb2;
+ int left, mtu, hlen, len;
+ int offset;
+ int not_last_frag;
+ u16 dont_fragment;
+ struct rtable *rt = (struct rtable*)skb->dst;
+
+ dev = skb->dev;
+
+ /*
+ * Point into the IP datagram header.
+ */
+
+ raw = skb->data;
+ iph = skb->nh.iph;
+
+ /*
+ * Setup starting values.
+ */
+
+ hlen = iph->ihl * 4;
+ left = ntohs(iph->tot_len) - hlen; /* Space per frame */
+ hlen += skb->nh.raw - raw;
+ if (local)
+ mtu = rt->u.dst.pmtu - hlen; /* Size of data space */
+ else
+ mtu = dev->mtu - hlen;
+ ptr = raw + hlen; /* Where to start from */
+
+ /*
+ * The protocol doesn't seem to say what to do in the case that the
+ * frame + options doesn't fit the mtu. As it used to fall down dead
+ * in this case we were fortunate it didn't happen
+ */
+
+ if (mtu<8) {
+ ip_statistics.IpFragFails++;
+ kfree_skb(skb, FREE_WRITE);
+ return;
+ }
+
+ /*
+ * Fragment the datagram.
+ */
+
+ offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+ not_last_frag = iph->frag_off & htons(IP_MF);
+
+ /*
+ * Nice moment: if DF is set and we are here,
+ * it means that packet should be fragmented and
+ * DF is set on fragments. If it works,
+ * path MTU discovery can be done by ONE segment(!). --ANK
+ */
+ dont_fragment = iph->frag_off & htons(IP_DF);
+
+ /*
+ * Keep copying data until we run out.
+ */
+
+ while(left > 0) {
+ len = left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > mtu)
+ len = mtu;
+ /* IF: we are not sending upto and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < left) {
+ len/=8;
+ len*=8;
+ }
/*
- * Multicasts are looped back for other local users
+ * Allocate buffer.
*/
-
- if (MULTICAST(daddr) && !(dev->flags&IFF_LOOPBACK))
- {
- /*
- * Loop back any frames. The check for IGMP_ALL_HOSTS is because
- * you are always magically a member of this group.
- *
- * Always loop back all host messages when running as a multicast router.
- */
-
- if(sk==NULL || sk->ip_mc_loop)
- {
- if(daddr==IGMP_ALL_HOSTS || (dev->flags&IFF_ALLMULTI))
- ip_loopback(dev,skb);
- else
- {
- struct ip_mc_list *imc=dev->ip_mc_list;
- while(imc!=NULL)
- {
- if(imc->multiaddr==daddr)
- {
- ip_loopback(dev,skb);
- break;
- }
- imc=imc->next;
- }
- }
- }
- /*
- * Multicasts with ttl 0 must not go beyond the host. Fixme: avoid the
- * extra clone.
- */
-
- if(skb->ip_hdr->ttl==0)
- {
- kfree_skb(skb, FREE_WRITE);
- nfrags++;
- continue;
- }
+ if ((skb2 = alloc_skb(len+hlen+15,GFP_ATOMIC)) == NULL) {
+ NETDEBUG(printk("IP: frag: no memory for new fragment!\n"));
+ ip_statistics.IpFragFails++;
+ kfree_skb(skb, FREE_WRITE);
+ return;
}
-#endif
- nfrags++;
-
/*
- * BSD loops broadcasts
+ * Set up data on packet
*/
-
- if((dev->flags&IFF_BROADCAST) && (daddr==0xFFFFFFFF || daddr==dev->pa_brdaddr) && !(dev->flags&IFF_LOOPBACK))
- ip_loopback(dev,skb);
+
+ skb2->arp = skb->arp;
+ skb2->dev = skb->dev;
+ skb2->when = skb->when;
+ skb2->pkt_type = skb->pkt_type;
+ skb2->priority = skb->priority;
+ skb_put(skb2, len + hlen);
+ skb2->mac.raw = (char *) skb2->data;
+ skb2->nh.raw = skb2->mac.raw + dev->hard_header_len;
+ skb2->h.raw = skb2->mac.raw + hlen;
/*
- * Now queue the bytes into the device.
+ * Charge the memory for the fragment to any owner
+ * it might possess
*/
-
- if (dev->flags & IFF_UP)
- {
- dev_queue_xmit(skb, dev, sk->priority);
- }
- else
- {
- /*
- * Whoops...
- */
-
- ip_statistics.IpOutDiscards++;
- if(nfrags>1)
- ip_statistics.IpFragCreates+=nfrags;
- kfree_skb(skb, FREE_WRITE);
- dev_unlock_list();
- /*
- * BSD behaviour.
- */
- if(sk!=NULL)
- sk->err=ENETDOWN;
- return(0); /* lose rest of fragments */
- }
- }
- while (offset >= 0);
- if(nfrags>1)
- ip_statistics.IpFragCreates+=nfrags;
- dev_unlock_list();
- return(0);
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+ skb2->dst = dst_clone(skb->dst);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+
+ memcpy(skb2->mac.raw, raw, hlen);
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ memcpy(skb2->h.raw, ptr, len);
+ left -= len;
+
+ /*
+ * Fill in the new header fields.
+ */
+ iph = skb2->nh.iph;
+ iph->frag_off = htons((offset >> 3))|dont_fragment;
+
+ /* ANK: dirty, but effective trick. Upgrade options only if
+ * the segment to be fragmented was THE FIRST (otherwise,
+ * options are already fixed) and make it ONCE
+ * on the initial skb, so that all the following fragments
+ * will inherit fixed options.
+ */
+ if (offset == 0)
+ ip_options_fragment(skb2);
+
+ /*
+ * Added AC : If we are fragmenting a fragment that's not the
+ * last fragment then keep MF on each bit
+ */
+ if (left > 0 || not_last_frag)
+ iph->frag_off |= htons(IP_MF);
+ ptr += len;
+ offset += len;
+
+ /*
+ * Put this fragment into the sending queue.
+ */
+
+ ip_statistics.IpFragCreates++;
+
+ iph->tot_len = htons(len + hlen - dev->hard_header_len);
+
+ ip_send_check(iph);
+
+ output(skb2);
+ }
+ kfree_skb(skb, FREE_WRITE);
+ ip_statistics.IpFragOKs++;
+}
+
+struct sk_buff * ip_reply(struct sk_buff *skb, int payload)
+{
+ struct {
+ struct ip_options opt;
+ char data[40];
+ } replyopts;
+
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct sk_buff *reply;
+ int iphlen;
+ struct iphdr *iph;
+
+ struct ipcm_cookie ipc;
+ u32 daddr;
+
+ if (ip_options_echo(&replyopts.opt, skb))
+ return NULL;
+
+ daddr = ipc.addr = rt->rt_src;
+ ipc.opt = &replyopts.opt;
+ if (ipc.opt->srr)
+ daddr = replyopts.opt.faddr;
+
+ if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), NULL))
+ return NULL;
+
+ iphlen = sizeof(struct iphdr) + replyopts.opt.optlen;
+ reply = alloc_skb(rt->u.dst.dev->hard_header_len+15+iphlen+payload, GFP_ATOMIC);
+ if (reply == NULL) {
+ ip_rt_put(rt);
+ return NULL;
+ }
+
+ reply->priority = skb->priority;
+ reply->dst = &rt->u.dst;
+
+ ip_ll_header_reserve(reply);
+
+ /*
+ * Now build the IP header.
+ */
+
+ /*
+ * Build the IP addresses
+ */
+
+ reply->nh.iph = iph = (struct iphdr *)skb_put(reply, iphlen);
+
+ iph->version = 4;
+ iph->ihl = iphlen>>2;
+ iph->tos = skb->nh.iph->tos;
+ iph->frag_off = 0;
+ iph->ttl = MAXTTL;
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = skb->nh.iph->protocol;
+
+ ip_options_build(reply, &replyopts.opt, daddr, rt->u.dst.dev->pa_addr, 0);
+
+ return reply;
}
-
/*
* IP protocol layer initialiser
@@ -1007,80 +953,40 @@
static struct packet_type ip_packet_type =
{
- 0, /* MUTTER ntohs(ETH_P_IP),*/
+ __constant_htons(ETH_P_IP),
NULL, /* All devices */
ip_rcv,
NULL,
NULL,
};
-#ifdef CONFIG_RTNETLINK
-
-/*
- * Netlink hooks for IP
- */
-
-void ip_netlink_msg(unsigned long msg, __u32 daddr, __u32 gw, __u32 mask, short flags, short metric, char *name)
-{
- struct sk_buff *skb=alloc_skb(sizeof(struct netlink_rtinfo), GFP_ATOMIC);
- struct netlink_rtinfo *nrt;
- struct sockaddr_in *s;
- if(skb==NULL)
- return;
- skb->free=1;
- nrt=(struct netlink_rtinfo *)skb_put(skb, sizeof(struct netlink_rtinfo));
- nrt->rtmsg_type=msg;
- s=(struct sockaddr_in *)&nrt->rtmsg_dst;
- s->sin_family=AF_INET;
- s->sin_addr.s_addr=daddr;
- s=(struct sockaddr_in *)&nrt->rtmsg_gateway;
- s->sin_family=AF_INET;
- s->sin_addr.s_addr=gw;
- s=(struct sockaddr_in *)&nrt->rtmsg_genmask;
- s->sin_family=AF_INET;
- s->sin_addr.s_addr=mask;
- nrt->rtmsg_flags=flags;
- nrt->rtmsg_metric=metric;
- strcpy(nrt->rtmsg_device,name);
- if (netlink_post(NETLINK_ROUTE, skb))
- kfree_skb(skb, FREE_WRITE);
-}
-
-#endif
/*
* Device notifier
*/
-static int ip_rt_event(struct notifier_block *this, unsigned long event, void *ptr)
+static int ip_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct device *dev=ptr;
- if(event==NETDEV_DOWN)
- {
- ip_netlink_msg(RTMSG_DELDEVICE, 0,0,0,0,0,dev->name);
- ip_rt_flush(dev);
- }
+
+ if (dev->family != AF_INET)
+ return NOTIFY_DONE;
+
+ if(event==NETDEV_UP) {
/*
* Join the initial group if multicast.
*/
- if(event==NETDEV_UP)
- {
-#ifdef CONFIG_IP_MULTICAST
ip_mc_allhost(dev);
-#endif
- ip_netlink_msg(RTMSG_NEWDEVICE, 0,0,0,0,0,dev->name);
- ip_rt_update(NETDEV_UP, dev);
}
- return NOTIFY_DONE;
+ return ip_rt_event(event, dev);
}
-struct notifier_block ip_rt_notifier={
- ip_rt_event,
+struct notifier_block ip_netdev_notifier={
+ ip_netdev_event,
NULL,
0
};
-#ifdef CONFIG_IP_MULTICAST
#ifdef CONFIG_PROC_FS
static struct proc_dir_entry proc_net_igmp = {
PROC_NET_IGMP, 4, "igmp",
@@ -1089,7 +995,6 @@
ip_mc_procinfo
};
#endif
-#endif
/*
* IP registers the packet type and then calls the subprotocol initialisers
@@ -1097,21 +1002,15 @@
void ip_init(void)
{
- ip_packet_type.type=htons(ETH_P_IP);
dev_add_pack(&ip_packet_type);
- /* So we flush routes when a device is downed */
- register_netdevice_notifier(&ip_rt_notifier);
+ ip_rt_init();
-/* ip_raw_init();
- ip_packet_init();
- ip_tcp_init();
- ip_udp_init();*/
+ /* So we flush routes and multicast lists when a device is downed */
+ register_netdevice_notifier(&ip_netdev_notifier);
-#ifdef CONFIG_IP_MULTICAST
#ifdef CONFIG_PROC_FS
proc_net_register(&proc_net_igmp);
#endif
-#endif
}
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov