patch-2.1.91 linux/net/ipv4/tcp_ipv4.c
Next file: linux/net/ipv4/tcp_output.c
Previous file: linux/net/ipv4/tcp_input.c
Back to the patch index
Back to the overall index
- Lines: 958
- Date: Mon Mar 23 16:48:25 1998
- Orig file: v2.1.90/linux/net/ipv4/tcp_ipv4.c
- Orig date: Tue Mar 17 22:18:16 1998
diff -u --recursive --new-file v2.1.90/linux/net/ipv4/tcp_ipv4.c linux/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.109 1998/03/15 07:24:15 davem Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.119 1998/03/22 19:14:47 davem Exp $
*
* IPv4 specific functions
*
@@ -62,16 +62,12 @@
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
+extern int sysctl_tcp_sack;
extern int sysctl_tcp_syncookies;
extern int sysctl_ip_dynaddr;
/* Check TCP sequence numbers in ICMP packets. */
-#define ICMP_PARANOIA 1
-#ifndef ICMP_PARANOIA
-#define ICMP_MIN_LENGTH 4
-#else
#define ICMP_MIN_LENGTH 8
-#endif
static void tcp_v4_send_reset(struct sk_buff *skb);
@@ -120,7 +116,7 @@
__u32 laddr = sk->rcv_saddr;
__u16 lport = sk->num;
__u32 faddr = sk->daddr;
- __u16 fport = sk->dummy_th.dest;
+ __u16 fport = sk->dport;
return tcp_hashfn(laddr, lport, faddr, fport);
}
@@ -365,7 +361,7 @@
sk = TCP_RHASH(sport);
if(sk &&
sk->daddr == saddr && /* remote address */
- sk->dummy_th.dest == sport && /* remote port */
+ sk->dport == sport && /* remote port */
sk->num == hnum && /* local port */
sk->rcv_saddr == daddr && /* local address */
(!sk->bound_dev_if || sk->bound_dev_if == dif))
@@ -377,7 +373,7 @@
hash = tcp_hashfn(daddr, hnum, saddr, sport);
for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
if(sk->daddr == saddr && /* remote address */
- sk->dummy_th.dest == sport && /* remote port */
+ sk->dport == sport && /* remote port */
sk->num == hnum && /* local port */
sk->rcv_saddr == daddr && /* local address */
(!sk->bound_dev_if || sk->bound_dev_if == dif)) {
@@ -389,7 +385,7 @@
/* Must check for a TIME_WAIT'er before going to listener hash. */
for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) {
if(sk->daddr == saddr && /* remote address */
- sk->dummy_th.dest == sport && /* remote port */
+ sk->dport == sport && /* remote port */
sk->num == hnum && /* local port */
sk->rcv_saddr == daddr && /* local address */
(!sk->bound_dev_if || sk->bound_dev_if == dif))
@@ -456,8 +452,8 @@
continue;
score++;
}
- if(s->dummy_th.dest) {
- if(s->dummy_th.dest != rnum)
+ if(s->dport) {
+ if(s->dport != rnum)
continue;
score++;
}
@@ -496,12 +492,7 @@
skb->h.th->source);
}
-/*
- * From tcp.c
- */
-
-/*
- * Check that a TCP address is unique, don't allow multiple
+/* Check that a TCP address is unique, don't allow multiple
* connects to/from the same address. Actually we can optimize
* quite a bit, since the socket about to connect is still
* in TCP_CLOSE, a tcp_bind_bucket for the local port he will
@@ -509,8 +500,7 @@
* The good_socknum and verify_bind scheme we use makes this
* work.
*/
-
-static int tcp_unique_address(struct sock *sk)
+static int tcp_v4_unique_address(struct sock *sk)
{
struct tcp_bind_bucket *tb;
unsigned short snum = sk->num;
@@ -524,7 +514,7 @@
/* Almost certainly the re-use port case, search the real hashes
* so it actually scales.
*/
- sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dummy_th.dest,
+ sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport,
sk->rcv_saddr, snum, sk->bound_dev_if);
if((sk != NULL) && (sk->state != TCP_LISTEN))
retval = 0;
@@ -535,19 +525,15 @@
return retval;
}
-
-/*
- * This will initiate an outgoing connection.
- */
-
+/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
- struct sk_buff *buff;
- int tmp;
- struct tcphdr *th;
- struct rtable *rt;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+ struct sk_buff *buff;
+ struct rtable *rt;
+ int tmp;
+ int mss;
if (sk->state != TCP_CLOSE)
return(-EISCONN);
@@ -567,8 +553,6 @@
printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
}
- dst_release(xchg(&sk->dst_cache, NULL));
-
tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr,
RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if);
if (tmp < 0)
@@ -579,143 +563,52 @@
return -ENETUNREACH;
}
- if (!tcp_unique_address(sk)) {
- ip_rt_put(rt);
- return -EADDRNOTAVAIL;
- }
+ dst_release(xchg(&sk->dst_cache, rt));
- lock_sock(sk);
-
- /* Do this early, so there is less state to unwind on failure. */
- buff = sock_wmalloc(sk, (MAX_SYN_SIZE + sizeof(struct sk_buff)),
+ buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
0, GFP_KERNEL);
- if (buff == NULL) {
- release_sock(sk);
- ip_rt_put(rt);
- return(-ENOBUFS);
- }
- sk->dst_cache = &rt->u.dst;
+ if (buff == NULL)
+ return -ENOBUFS;
+
+ /* Socket has no identity, so lock_sock() is useless. Also
+ * since state==TCP_CLOSE (checked above) the socket cannot
+ * possibly be in the hashes. TCP hash locking is only
+ * needed while checking quickly for a unique address.
+ * However, the socket does need to be (and is) locked
+ * in tcp_connect().
+ * Perhaps this addresses all of ANK's concerns. 8-) -DaveM
+ */
+ sk->dport = usin->sin_port;
sk->daddr = rt->rt_dst;
if (!sk->saddr)
sk->saddr = rt->rt_src;
sk->rcv_saddr = sk->saddr;
- if (sk->priority == 0)
- sk->priority = rt->u.dst.priority;
-
- sk->dummy_th.dest = usin->sin_port;
-
- tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
- sk->dummy_th.source,
- usin->sin_port);
- tp->snd_wnd = 0;
- tp->snd_wl1 = 0;
- tp->snd_wl2 = tp->write_seq;
- tp->snd_una = tp->write_seq;
- tp->rcv_nxt = 0;
-
- sk->err = 0;
-
- /* Put in the IP header and routing stuff. */
- tmp = ip_build_header(buff, sk);
- if (tmp < 0) {
- /* Caller has done ip_rt_put(rt) and set sk->dst_cache
- * to NULL. We must unwind the half built TCP socket
- * state so that this failure does not create a "stillborn"
- * sock (ie. future re-tries of connect() would fail).
- */
- sk->daddr = 0;
- sk->saddr = sk->rcv_saddr = 0;
+ if (!tcp_v4_unique_address(sk)) {
kfree_skb(buff);
- release_sock(sk);
- return(-ENETUNREACH);
+ return -EADDRNOTAVAIL;
}
- /* No failure conditions can result past this point. */
-
- /* We'll fix this up when we get a response from the other end.
- * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
- */
- tp->tcp_header_len = sizeof(struct tcphdr) +
- (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
-
- th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
- buff->h.th = th;
-
- memcpy(th,(void *)&(sk->dummy_th), sizeof(*th));
- /* th->doff gets fixed up below if we tack on options. */
-
- buff->seq = tp->write_seq++;
- th->seq = htonl(buff->seq);
- tp->snd_nxt = tp->write_seq;
- buff->end_seq = tp->write_seq;
- th->ack = 0;
- th->syn = 1;
-
sk->mtu = rt->u.dst.pmtu;
if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
(sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
(rt->u.dst.mxlock&(1<<RTAX_MTU)))) &&
- rt->u.dst.pmtu > 576)
+ rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway)
sk->mtu = 576;
- if(sk->mtu < 64)
+ if (sk->mtu < 64)
sk->mtu = 64; /* Sanity limit */
- sk->mss = (sk->mtu - sizeof(struct iphdr) - tp->tcp_header_len);
- if(sk->user_mss)
- sk->mss = min(sk->mss, sk->user_mss);
-
- if (sk->mss < 1) {
- printk(KERN_DEBUG "intial sk->mss below 1\n");
- sk->mss = 1; /* Sanity limit */
- }
-
- tp->window_clamp = rt->u.dst.window;
- tcp_select_initial_window(sock_rspace(sk)/2,sk->mss,
- &tp->rcv_wnd,
- &tp->window_clamp,
- sysctl_tcp_window_scaling,
- &tp->rcv_wscale);
- th->window = htons(tp->rcv_wnd);
-
- tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_timestamps,
- sysctl_tcp_window_scaling, tp->rcv_wscale);
- buff->csum = 0;
- th->doff = (sizeof(*th)+ tmp)>>2;
-
- tcp_v4_send_check(sk, th, sizeof(struct tcphdr) + tmp, buff);
-
- tcp_set_state(sk,TCP_SYN_SENT);
-
- /* Socket identity change complete, no longer
- * in TCP_CLOSE, so enter ourselves into the
- * hash tables.
- */
- tcp_v4_hash(sk);
+ mss = sk->mtu - sizeof(struct iphdr);
+ if (sk->opt)
+ mss -= sk->opt->optlen;
- tp->rto = rt->u.dst.rtt;
-
- tcp_init_xmit_timers(sk);
-
- /* Now works the right way instead of a hacked initial setting. */
- tp->retransmits = 0;
-
- skb_queue_tail(&sk->write_queue, buff);
-
- tp->packets_out++;
- buff->when = jiffies;
-
- ip_queue_xmit(skb_clone(buff, GFP_KERNEL));
+ tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
+ sk->sport, usin->sin_port);
- /* Timer for repeating the SYN until an answer. */
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
- tcp_statistics.TcpActiveOpens++;
- tcp_statistics.TcpOutSegs++;
-
- release_sock(sk);
- return(0);
+ tcp_connect(sk, buff, mss);
+ return 0;
}
static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
@@ -724,7 +617,7 @@
int retval = -EINVAL;
/* Do sanity checking for sendmsg/sendto/send. */
- if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT))
+ if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
goto out;
if (msg->msg_name) {
struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
@@ -737,7 +630,7 @@
if(sk->state == TCP_CLOSE)
goto out;
retval = -EISCONN;
- if (addr->sin_port != sk->dummy_th.dest)
+ if (addr->sin_port != sk->dport)
goto out;
if (addr->sin_addr.s_addr != sk->daddr)
goto out;
@@ -851,9 +744,7 @@
int code = skb->h.icmph->code;
struct sock *sk;
int opening;
-#ifdef ICMP_PARANOIA
__u32 seq;
-#endif
if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
icmp_statistics.IcmpInErrors++;
@@ -869,7 +760,6 @@
}
tp = &sk->tp_pinfo.af_tcp;
-#ifdef ICMP_PARANOIA
seq = ntohl(th->seq);
if (sk->state != TCP_LISTEN &&
!between(seq, tp->snd_una, max(tp->snd_una+32768,tp->snd_nxt))) {
@@ -879,7 +769,6 @@
(int)sk->state, seq, tp->snd_una, tp->snd_nxt);
return;
}
-#endif
switch (type) {
case ICMP_SOURCE_QUENCH:
@@ -927,7 +816,6 @@
req = tcp_v4_search_req(tp, iph, th, &prev);
if (!req)
return;
-#ifdef ICMP_PARANOIA
if (seq != req->snt_isn) {
if (net_ratelimit())
printk(KERN_DEBUG "icmp packet for openreq "
@@ -935,7 +823,6 @@
seq, req->snt_isn);
return;
}
-#endif
if (req->sk) { /* not yet accept()ed */
sk = req->sk; /* report error in accept */
} else {
@@ -987,44 +874,50 @@
static void tcp_v4_send_reset(struct sk_buff *skb)
{
- struct tcphdr *th = skb->h.th;
- struct sk_buff *skb1;
- struct tcphdr *th1;
+ struct tcphdr *th = skb->h.th;
- if (th->rst)
- return;
+ /* Never send a reset in response to a reset. */
+ if (th->rst == 0) {
+ struct tcphdr *th = skb->h.th;
+ struct sk_buff *skb1 = ip_reply(skb, sizeof(struct tcphdr));
+ struct tcphdr *th1;
- skb1 = ip_reply(skb, sizeof(struct tcphdr));
- if (skb1 == NULL)
- return;
+ if (skb1 == NULL)
+ return;
- skb1->h.th = th1 = (struct tcphdr *)skb_put(skb1, sizeof(struct tcphdr));
- memset(th1, 0, sizeof(*th1));
+ skb1->h.th = th1 = (struct tcphdr *)
+ skb_put(skb1, sizeof(struct tcphdr));
- /* Swap the send and the receive. */
- th1->dest = th->source;
- th1->source = th->dest;
- th1->doff = sizeof(*th1)/4;
- th1->rst = 1;
-
- if (th->ack)
- th1->seq = th->ack_seq;
- else {
- th1->ack = 1;
- if (!th->syn)
- th1->ack_seq = th->seq;
- else
- th1->ack_seq = htonl(ntohl(th->seq)+1);
- }
+ /* Swap the send and the receive. */
+ memset(th1, 0, sizeof(*th1));
+ th1->dest = th->source;
+ th1->source = th->dest;
+ th1->doff = sizeof(*th1)/4;
+ th1->rst = 1;
+
+ if (th->ack) {
+ th1->seq = th->ack_seq;
+ } else {
+ th1->ack = 1;
+ if (!th->syn)
+ th1->ack_seq = th->seq;
+ else
+ th1->ack_seq = htonl(ntohl(th->seq)+1);
+ }
+ skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0);
+ th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr,
+ skb1->nh.iph->daddr, skb1->csum);
+
+ /* Finish up some IP bits. */
+ skb1->nh.iph->tot_len = htons(skb1->len);
+ ip_send_check(skb1->nh.iph);
- skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0);
- th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr,
- skb1->nh.iph->daddr, skb1->csum);
+ /* All the other work was done by ip_reply(). */
+ skb1->dst->output(skb1);
- /* Do not place TCP options in a reset. */
- ip_queue_xmit(skb1);
- tcp_statistics.TcpOutSegs++;
- tcp_statistics.TcpOutRsts++;
+ tcp_statistics.TcpOutSegs++;
+ tcp_statistics.TcpOutRsts++;
+ }
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
@@ -1055,82 +948,48 @@
static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
{
+ struct rtable *rt;
+ struct ip_options *opt;
struct sk_buff * skb;
- struct tcphdr *th;
- int tmp;
int mss;
- skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
- if (skb == NULL)
+ /* First, grab a route. */
+ opt = req->af.v4_req.opt;
+ if(ip_route_output(&rt, ((opt && opt->srr) ?
+ opt->faddr :
+ req->af.v4_req.rmt_addr),
+ req->af.v4_req.loc_addr,
+ RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
+ sk->bound_dev_if)) {
+ ip_statistics.IpOutNoRoutes++;
return;
-
- if(ip_build_pkt(skb, sk, req->af.v4_req.loc_addr,
- req->af.v4_req.rmt_addr, req->af.v4_req.opt) < 0) {
- kfree_skb(skb);
+ }
+ if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
+ ip_rt_put(rt);
+ ip_statistics.IpOutNoRoutes++;
return;
}
-
- mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
- if (sk->user_mss)
- mss = min(mss, sk->user_mss);
- if(req->tstamp_ok)
- mss -= TCPOLEN_TSTAMP_ALIGNED;
- else
- req->mss += TCPOLEN_TSTAMP_ALIGNED;
- /* tcp_syn_build_options will do an skb_put() to obtain the TCP
- * options bytes below.
- */
- skb->h.th = th = (struct tcphdr *) skb_put(skb, sizeof(struct tcphdr));
-
- /* Don't offer more than they did.
- * This way we don't have to memorize who said what.
- * FIXME: maybe this should be changed for better performance
- * with syncookies.
- */
- req->mss = min(mss, req->mss);
-
- if (req->mss < 1) {
- printk(KERN_DEBUG "initial req->mss below 1\n");
- req->mss = 1;
- }
+ mss = (rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
+ if (opt)
+ mss -= opt->optlen;
+
+ skb = tcp_make_synack(sk, &rt->u.dst, req, mss);
+ if (skb) {
+ struct tcphdr *th = skb->h.th;
- /* Yuck, make this header setup more efficient... -DaveM */
- memset(th, 0, sizeof(struct tcphdr));
- th->syn = 1;
- th->ack = 1;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
- th->source = req->lcl_port; /* LVE */
-#else
- th->source = sk->dummy_th.source;
+ th->source = req->lcl_port; /* LVE */
#endif
- th->dest = req->rmt_port;
- skb->seq = req->snt_isn;
- skb->end_seq = skb->seq + 1;
- th->seq = htonl(skb->seq);
- th->ack_seq = htonl(req->rcv_isn + 1);
- if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
- __u8 rcv_wscale;
- /* Set this up on the first call only */
- req->window_clamp = skb->dst->window;
- tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
- &req->rcv_wnd,
- &req->window_clamp,
- req->wscale_ok,
- &rcv_wscale);
- req->rcv_wscale = rcv_wscale;
- }
- th->window = htons(req->rcv_wnd);
- tmp = tcp_syn_build_options(skb, req->mss, req->tstamp_ok,
- req->wscale_ok,req->rcv_wscale);
- skb->csum = 0;
- th->doff = (sizeof(*th) + tmp)>>2;
- th->check = tcp_v4_check(th, sizeof(*th) + tmp,
- req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
- csum_partial((char *)th, sizeof(*th)+tmp, skb->csum));
- ip_queue_xmit(skb);
- tcp_statistics.TcpOutSegs++;
+ th->check = tcp_v4_check(th, skb->len,
+ req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
+ csum_partial((char *)th, skb->len, skb->csum));
+
+ ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
+ req->af.v4_req.rmt_addr, req->af.v4_req.opt);
+ }
+ ip_rt_put(rt);
}
static void tcp_v4_or_free(struct open_request *req)
@@ -1240,15 +1099,16 @@
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->rcv_isn = skb->seq;
- tp.tstamp_ok = tp.wscale_ok = tp.snd_wscale = 0;
+ tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
tp.in_mss = 536;
- tcp_parse_options(th,&tp,want_cookie);
+ tcp_parse_options(NULL, th, &tp, want_cookie);
req->mss = tp.in_mss;
if (tp.saw_tstamp) {
req->mss -= TCPOLEN_TSTAMP_ALIGNED;
req->ts_recent = tp.rcv_tsval;
}
req->tstamp_ok = tp.tstamp_ok;
+ req->sack_ok = tp.sack_ok;
req->snd_wscale = tp.snd_wscale;
req->wscale_ok = tp.wscale_ok;
req->rmt_port = th->source;
@@ -1300,8 +1160,11 @@
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ *
+ * This function wants to be moved to a common for IPv[46] file. --ANK
*/
-struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb,
+ int snd_mss)
{
struct sock *newsk = sk_alloc(AF_INET, GFP_ATOMIC, 0);
@@ -1310,27 +1173,16 @@
memcpy(newsk, sk, sizeof(*newsk));
newsk->sklist_next = NULL;
- newsk->daddr = req->af.v4_req.rmt_addr;
- newsk->rcv_saddr = req->af.v4_req.loc_addr;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- newsk->num = ntohs(skb->h.th->dest);
-#endif
newsk->state = TCP_SYN_RECV;
/* Clone the TCP header template */
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- newsk->dummy_th.source = req->lcl_port;
-#endif
- newsk->dummy_th.dest = req->rmt_port;
- newsk->dummy_th.ack = 1;
- newsk->dummy_th.doff = sizeof(struct tcphdr)>>2;
+ newsk->dport = req->rmt_port;
newsk->sock_readers = 0;
atomic_set(&newsk->rmem_alloc, 0);
skb_queue_head_init(&newsk->receive_queue);
atomic_set(&newsk->wmem_alloc, 0);
skb_queue_head_init(&newsk->write_queue);
- newsk->saddr = req->af.v4_req.loc_addr;
newsk->done = 0;
newsk->proc = 0;
@@ -1395,12 +1247,40 @@
newsk->priority = 1;
/* IP layer stuff */
- newsk->opt = req->af.v4_req.opt;
newsk->timeout = 0;
init_timer(&newsk->timer);
newsk->timer.function = &net_timer;
newsk->timer.data = (unsigned long) newsk;
newsk->socket = NULL;
+
+ newtp->tstamp_ok = req->tstamp_ok;
+ if((newtp->sack_ok = req->sack_ok) != 0)
+ newtp->num_sacks = 0;
+ newtp->window_clamp = req->window_clamp;
+ newtp->rcv_wnd = req->rcv_wnd;
+ newtp->wscale_ok = req->wscale_ok;
+ if (newtp->wscale_ok) {
+ newtp->snd_wscale = req->snd_wscale;
+ newtp->rcv_wscale = req->rcv_wscale;
+ } else {
+ newtp->snd_wscale = newtp->rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp,65535);
+ }
+ if (newtp->tstamp_ok) {
+ newtp->ts_recent = req->ts_recent;
+ newtp->ts_recent_stamp = jiffies;
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
+ snd_mss -= newtp->tcp_header_len;
+
+ if (sk->user_mss)
+ snd_mss = min(snd_mss, sk->user_mss);
+
+ newsk->mss = min(req->mss, snd_mss);
+
}
return newsk;
}
@@ -1409,77 +1289,58 @@
struct open_request *req,
struct dst_entry *dst)
{
+ struct ip_options *opt = req->af.v4_req.opt;
struct tcp_opt *newtp;
struct sock *newsk;
int snd_mss;
+ int mtu;
#ifdef NEW_LISTEN
if (sk->ack_backlog > sk->max_ack_backlog)
goto exit; /* head drop */
#endif
- newsk = tcp_create_openreq_child(sk, req, skb);
- if (!newsk)
- goto exit;
-#ifdef NEW_LISTEN
- sk->ack_backlog++;
-#endif
-
- newtp = &(newsk->tp_pinfo.af_tcp);
-
- /* options / mss / route_cache */
if (dst == NULL) {
struct rtable *rt;
if (ip_route_output(&rt,
- newsk->opt && newsk->opt->srr ?
- newsk->opt->faddr : newsk->daddr,
- newsk->saddr, newsk->ip_tos|RTO_CONN, 0)) {
- sk_free(newsk);
+ opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
+ req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0))
return NULL;
- }
dst = &rt->u.dst;
- }
- newsk->dst_cache = dst;
-
- snd_mss = dst->pmtu;
-
- /* FIXME: is mtu really the same as snd_mss? */
- newsk->mtu = snd_mss;
- /* FIXME: where does mtu get used after this? */
- /* sanity check */
- if (newsk->mtu < 64)
- newsk->mtu = 64;
-
- newtp->tstamp_ok = req->tstamp_ok;
- newtp->window_clamp = req->window_clamp;
- newtp->rcv_wnd = req->rcv_wnd;
- newtp->wscale_ok = req->wscale_ok;
- if (newtp->wscale_ok) {
- newtp->snd_wscale = req->snd_wscale;
- newtp->rcv_wscale = req->rcv_wscale;
- } else {
- newtp->snd_wscale = newtp->rcv_wscale = 0;
- newtp->window_clamp = min(newtp->window_clamp,65535);
- }
- if (newtp->tstamp_ok) {
- newtp->ts_recent = req->ts_recent;
- newtp->ts_recent_stamp = jiffies;
- newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
- newsk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2);
- } else {
- newtp->tcp_header_len = sizeof(struct tcphdr);
}
- snd_mss -= sizeof(struct iphdr) + sizeof(struct tcphdr);
- if (sk->user_mss)
- snd_mss = min(snd_mss, sk->user_mss);
+#ifdef NEW_LISTEN
+ sk->ack_backlog++;
+#endif
+
+ mtu = dst->pmtu;
+ if (mtu < 68)
+ mtu = 68;
+ snd_mss = mtu - sizeof(struct iphdr);
+ if (opt)
+ snd_mss -= opt->optlen;
+
+ newsk = tcp_create_openreq_child(sk, req, skb, snd_mss);
+ if (!newsk)
+ goto exit;
+
+ newsk->dst_cache = dst;
- /* Make sure our mtu is adjusted for headers. */
- newsk->mss = min(req->mss, snd_mss) + sizeof(struct tcphdr) - newtp->tcp_header_len;
+ newtp = &(newsk->tp_pinfo.af_tcp);
+ newsk->daddr = req->af.v4_req.rmt_addr;
+ newsk->saddr = req->af.v4_req.loc_addr;
+ newsk->rcv_saddr = req->af.v4_req.loc_addr;
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ newsk->num = ntohs(skb->h.th->dest);
+ newsk->sport = req->lcl_port;
+#endif
+ newsk->opt = req->af.v4_req.opt;
+ newsk->mtu = mtu;
/* Must use the af_specific ops here for the case of IPv6 mapped. */
newsk->prot->hash(newsk);
add_to_prot_sklist(newsk);
+
return newsk;
exit:
@@ -1677,106 +1538,82 @@
goto discard_it;
}
-int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb)
+int tcp_v4_rebuild_header(struct sock *sk)
{
- return ip_build_header(skb, sk);
-}
-
-int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb)
-{
- struct rtable *rt;
- struct iphdr *iph;
- struct tcphdr *th;
- int size;
+ struct rtable *rt = (struct rtable *)sk->dst_cache;
+ __u32 new_saddr;
int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
- /* Check route */
-
- rt = (struct rtable*)skb->dst;
+ if(rt == NULL)
+ return 0;
- /* Force route checking if want_rewrite */
- /* The idea is good, the implementation is disguisting.
- Well, if I made bind on this socket, you cannot randomly ovewrite
- its source address. --ANK
+ /* Force route checking if want_rewrite.
+ * The idea is good, the implementation is disguisting.
+ * Well, if I made bind on this socket, you cannot randomly ovewrite
+ * its source address. --ANK
*/
if (want_rewrite) {
int tmp;
+ struct rtable *new_rt;
__u32 old_saddr = rt->rt_src;
- /* Query new route */
- tmp = ip_route_connect(&rt, rt->rt_dst, 0,
+ /* Query new route using another rt buffer */
+ tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
RT_TOS(sk->ip_tos)|sk->localroute,
sk->bound_dev_if);
/* Only useful if different source addrs */
- if (tmp == 0 || rt->rt_src != old_saddr ) {
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
- } else {
- want_rewrite = 0;
- dst_release(&rt->u.dst);
+ if (tmp == 0) {
+ /*
+ * Only useful if different source addrs
+ */
+ if (new_rt->rt_src != old_saddr ) {
+ dst_release(sk->dst_cache);
+ sk->dst_cache = &new_rt->u.dst;
+ rt = new_rt;
+ goto do_rewrite;
+ }
+ dst_release(&new_rt->u.dst);
}
- } else
+ }
if (rt->u.dst.obsolete) {
int err;
err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif);
if (err) {
sk->err_soft=-err;
- sk->error_report(skb->sk);
+ sk->error_report(sk);
return -1;
}
- dst_release(skb->dst);
- skb->dst = &rt->u.dst;
+ dst_release(xchg(&sk->dst_cache, &rt->u.dst));
}
- iph = skb->nh.iph;
- th = skb->h.th;
- size = skb->tail - skb->h.raw;
+ return 0;
- if (want_rewrite) {
- __u32 new_saddr = rt->rt_src;
+do_rewrite:
+ new_saddr = rt->rt_src;
- /*
- * Ouch!, this should not happen.
- */
- if (!sk->saddr || !sk->rcv_saddr) {
- printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: saddr=%08lX rcv_saddr=%08lX\n",
- ntohl(sk->saddr),
- ntohl(sk->rcv_saddr));
- return 0;
- }
-
- /*
- * Maybe whe are in a skb chain loop and socket address has
- * yet been 'damaged'.
- */
-
- if (new_saddr != sk->saddr) {
- if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
- NIPQUAD(sk->saddr),
- NIPQUAD(new_saddr));
- }
+ /* Ouch!, this should not happen. */
+ if (!sk->saddr || !sk->rcv_saddr) {
+ printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
+ "saddr=%08lX rcv_saddr=%08lX\n",
+ ntohl(sk->saddr),
+ ntohl(sk->rcv_saddr));
+ return 0;
+ }
- sk->saddr = new_saddr;
- sk->rcv_saddr = new_saddr;
- /* sk->prot->rehash(sk); */
- tcp_v4_rehash(sk);
- }
-
- if (new_saddr != iph->saddr) {
- if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "tcp_v4_rebuild_header(): shifting iph->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
- NIPQUAD(iph->saddr),
- NIPQUAD(new_saddr));
- }
+ if (new_saddr != sk->saddr) {
+ if (sysctl_ip_dynaddr > 1) {
+ printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
+ "from %d.%d.%d.%d to %d.%d.%d.%d\n",
+ NIPQUAD(sk->saddr),
+ NIPQUAD(new_saddr));
+ }
- iph->saddr = new_saddr;
- ip_send_check(iph);
- }
+ sk->saddr = new_saddr;
+ sk->rcv_saddr = new_saddr;
+ tcp_v4_rehash(sk);
+ }
- }
-
return 0;
}
@@ -1792,11 +1629,10 @@
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = sk->daddr;
- sin->sin_port = sk->dummy_th.dest;
+ sin->sin_port = sk->dport;
}
struct tcp_func ipv4_specific = {
- tcp_v4_build_header,
ip_queue_xmit,
tcp_v4_send_check,
tcp_v4_rebuild_header,
@@ -1834,10 +1670,6 @@
sk->max_ack_backlog = SOMAXCONN;
sk->mtu = 576;
sk->mss = 536;
-
- /* Speed up by setting some standard state for the dummy_th. */
- sk->dummy_th.ack=1;
- sk->dummy_th.doff=sizeof(struct tcphdr)>>2;
/* Init SYN queue. */
tcp_synq_init(tp);
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov