patch-2.4.4 linux/net/ipv4/tcp_input.c


diff -u --recursive --new-file v2.4.3/linux/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
  *
  *		Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:	$Id: tcp_input.c,v 1.205 2000/12/13 18:31:48 davem Exp $
+ * Version:	$Id: tcp_input.c,v 1.228 2001/04/20 20:46:19 davem Exp $
  *
  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -137,7 +137,7 @@
 		 *
 		 * "len" is invariant segment length, including TCP header.
 		 */
-		len = skb->tail - skb->h.raw;
+		len += skb->data - skb->h.raw;
 		if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
 		    /* If PSH is not set, packet should be
 		     * full sized, provided peer TCP is not badly broken.
@@ -378,7 +378,8 @@
 		/* The _first_ data packet received, initialize
 		 * delayed ACK engine.
 		 */
-		tcp_enter_quickack_mode(tp);
+		tcp_incr_quickack(tp);
+		tp->ack.ato = TCP_ATO_MIN;
 	} else {
 		int m = now - tp->ack.lrcvtime;
 
@@ -510,7 +511,7 @@
 }
 
 /* Save metrics learned by this TCP session.
-   This function is called only, when TCP finishes sucessfully
+   This function is called only, when TCP finishes successfully
    i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
  */
 void tcp_update_metrics(struct sock *sk)
@@ -1016,7 +1017,7 @@
 			tp->fackets_out = cnt;
 		}
 	}
-	tp->left_out = tp->sacked_out + tp->lost_out;
+	tcp_sync_left_out(tp);
 
 	tp->reordering = min(tp->reordering, sysctl_tcp_reordering);
 	tp->ca_state = TCP_CA_Loss;
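
Throughout this patch the open-coded sum "tp->left_out = tp->sacked_out +
tp->lost_out;" is replaced by tcp_sync_left_out(tp), a helper added in
include/net/tcp.h and not visible in this file. Judging only from the lines
it replaces, it presumably centralizes that same bookkeeping, roughly as in
the sketch below; the clamp on sacked_out is an assumption, not something
this diff shows:

    #include <stdio.h>

    /* Hypothetical expansion of tcp_sync_left_out(): left_out counts
     * segments that have left the network, i.e. SACKed ones plus those
     * marked lost on the scoreboard.
     */
    struct scoreboard {
        unsigned int packets_out, sacked_out, lost_out, left_out;
    };

    static void sync_left_out(struct scoreboard *tp)
    {
        if (tp->sacked_out + tp->lost_out > tp->packets_out)
            tp->sacked_out = tp->packets_out - tp->lost_out;
        tp->left_out = tp->sacked_out + tp->lost_out;
    }

    int main(void)
    {
        struct scoreboard tp = { 10, 4, 2, 0 };
        sync_left_out(&tp);
        printf("left_out=%u\n", tp.left_out);   /* 6 */
        return 0;
    }
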
@@ -1052,6 +1053,15 @@
 	return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
 }
 
+static inline int tcp_skb_timedout(struct tcp_opt *tp, struct sk_buff *skb)
+{
+	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
+}
+
+static inline int tcp_head_timedout(struct sock *sk, struct tcp_opt *tp)
+{
+	return tp->packets_out && tcp_skb_timedout(tp, skb_peek(&sk->write_queue));
+}
 
 /* Linux NewReno/SACK/FACK/ECN state machine.
  * --------------------------------------
@@ -1157,7 +1167,13 @@
 	if (tcp_fackets_out(tp) > tp->reordering)
 		return 1;
 
-	/* Trick#3: It is still not OK... But will it be useful to delay
+	/* Trick#3 : when we use RFC2988 timer restart, fast
+	 * retransmit can be triggered by timeout of queue head.
+	 */
+	if (tcp_head_timedout(sk, tp))
+		return 1;
+
+	/* Trick#4: It is still not OK... But will it be useful to delay
 	 * recovery more?
 	 */
 	if (tp->packets_out <= tp->reordering &&
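
Trick#3 works together with the RFC2988-style timer restart introduced in
tcp_ack_packets_out() further down: since every ACK now re-arms the
retransmit timer to now+rto, a lost segment at the head of the write queue
would otherwise only be recovered by a full timeout. Checking the age of the
queue head lets recovery start as soon as that segment has been outstanding
longer than rto. A standalone illustration of the test with invented jiffies
values (tcp_time_stamp and TCP_SKB_CB(skb)->when are jiffies in the real
code):

    #include <stdio.h>

    /* Same comparison as tcp_skb_timedout(), on plain integers. */
    static int head_timedout(unsigned long now, unsigned long sent,
                             unsigned long rto)
    {
        return now - sent > rto;
    }

    int main(void)
    {
        const unsigned long rto = 300;          /* 3 s at HZ=100 (example) */
        const unsigned long head_sent = 1000;   /* head of the write queue */
        unsigned long now;

        for (now = 1100; now <= 1500; now += 100)
            printf("jiffies=%lu: %s\n", now,
                   head_timedout(now, head_sent, rto) ?
                   "head timed out, recovery may start" :
                   "head still within rto");
        return 0;
    }
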
@@ -1178,8 +1194,10 @@
  */
 static void tcp_check_reno_reordering(struct tcp_opt *tp, int addend)
 {
-	if (tp->sacked_out + 1 > tp->packets_out) {
-		tp->sacked_out = tp->packets_out ? tp->packets_out - 1 : 0;
+	int holes = min(max(tp->lost_out, 1), tp->packets_out);
+
+	if (tp->sacked_out + holes > tp->packets_out) {
+		tp->sacked_out = tp->packets_out - holes;
 		tcp_update_reordering(tp, tp->packets_out+addend, 0);
 	}
 }
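
The Reno reordering check above now accounts for the holes it actually knows
about (lost_out) instead of always assuming exactly one. A worked example
with invented counters, using plain helpers in place of the kernel's min()
and max() macros:

    #include <stdio.h>

    static unsigned int umin(unsigned int a, unsigned int b) { return a < b ? a : b; }
    static unsigned int umax(unsigned int a, unsigned int b) { return a > b ? a : b; }

    int main(void)
    {
        unsigned int packets_out = 5;   /* segments in flight             */
        unsigned int lost_out = 2;      /* two holes already marked lost  */
        unsigned int sacked_out = 4;    /* duplicate ACKs counted so far  */
        unsigned int holes = umin(umax(lost_out, 1), packets_out);

        if (sacked_out + holes > packets_out) {
            /* More duplicate ACKs than the holes can explain: the extras
             * must be reordering rather than loss.                       */
            sacked_out = packets_out - holes;
            printf("reordering detected, sacked_out clamped to %u\n",
                   sacked_out);
        }
        return 0;
    }
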
@@ -1190,7 +1208,7 @@
 {
 	++tp->sacked_out;
 	tcp_check_reno_reordering(tp, 0);
-	tp->left_out = tp->sacked_out + tp->lost_out;
+	tcp_sync_left_out(tp);
 }
 
 /* Account for ACK, ACKing some data in Reno Recovery phase. */
@@ -1198,17 +1216,14 @@
 static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_opt *tp, int acked)
 {
 	if (acked > 0) {
-		/* One ACK eated lost packet. Must eat! */
-		BUG_TRAP(tp->lost_out == 0);
-
-		/* The rest eat duplicate ACKs. */
+		/* One ACK acked hole. The rest eat duplicate ACKs. */
 		if (acked-1 >= tp->sacked_out)
 			tp->sacked_out = 0;
 		else
 			tp->sacked_out -= acked-1;
 	}
 	tcp_check_reno_reordering(tp, acked);
-	tp->left_out = tp->sacked_out + tp->lost_out;
+	tcp_sync_left_out(tp);
 }
 
 static inline void tcp_reset_reno_sack(struct tcp_opt *tp)
@@ -1234,7 +1249,7 @@
 			tp->lost_out++;
 		}
 	}
-	tp->left_out = tp->sacked_out + tp->lost_out;
+	tcp_sync_left_out(tp);
 }
 
 /* Account newly detected lost packet(s) */
@@ -1249,6 +1264,24 @@
 	} else {
 		tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
 	}
+
+	/* New heuristics: it is possible only after we switched
+	 * to restart timer each time when something is ACKed.
+	 * Hence, we can detect timed out packets during fast
+	 * retransmit without falling to slow start.
+	 */
+	if (tcp_head_timedout(sk, tp)) {
+		struct sk_buff *skb;
+
+		for_retrans_queue(skb, sk, tp) {
+			if (tcp_skb_timedout(tp, skb) &&
+			    !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+				tp->lost_out++;
+			}
+		}
+		tcp_sync_left_out(tp);
+	}
 }
 
 /* CWND moderation, preventing bursts due to too big ACKs
@@ -1490,7 +1523,7 @@
 	}
 
 	/* D. Synchronize left_out to current state. */
-	tp->left_out = tp->sacked_out + tp->lost_out;
+	tcp_sync_left_out(tp);
 
 	/* E. Check state exit conditions. State can be terminated
 	 *    when high_seq is ACKed. */
@@ -1516,8 +1549,13 @@
 
 		case TCP_CA_Disorder:
 			tcp_try_undo_dsack(sk, tp);
-			tp->undo_marker = 0;
-			tp->ca_state = TCP_CA_Open;
+			if (!tp->undo_marker ||
+			    /* For SACK case do not Open to allow to undo
+			     * catching for all duplicate ACKs. */
+			    IsReno(tp) || tp->snd_una != tp->high_seq) {
+				tp->undo_marker = 0;
+				tp->ca_state = TCP_CA_Open;
+			}
 			break;
 
 		case TCP_CA_Recovery:
@@ -1544,8 +1582,8 @@
 		}
 		break;
 	case TCP_CA_Loss:
-		if (flag & FLAG_ACKED)
-			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+		if (flag&FLAG_DATA_ACKED)
+			tp->retransmits = 0;
 		if (!tcp_try_undo_loss(sk, tp)) {
 			tcp_moderate_cwnd(tp);
 			tcp_xmit_retransmit_queue(sk);
@@ -1593,7 +1631,7 @@
 		tp->ca_state = TCP_CA_Recovery;
 	}
 
-	if (is_dupack)
+	if (is_dupack || tcp_head_timedout(sk, tp))
 		tcp_update_scoreboard(sk, tp);
 	tcp_cwnd_down(tp);
 	tcp_xmit_retransmit_queue(sk);
@@ -1613,16 +1651,18 @@
 	 *
 	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
 	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
+	 *
+	 * Changed: reset backoff as soon as we see the first valid sample.
+	 * If we do not, we get strongly overstimated rto. With timestamps
+	 * samples are accepted even from very old segments: f.e., when rtt=1
+	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
+	 * answer arrives rto becomes 120 seconds! If at least one of segments
+	 * in window is lost... Voila.	 			--ANK (010210)
 	 */
 	seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
 	tcp_rtt_estimator(tp, seq_rtt);
 	tcp_set_rto(tp);
-	if (tp->backoff) {
-		if (!tp->retransmits || !(flag & FLAG_RETRANS_DATA_ACKED))
-			tp->backoff = 0;
-		else
-			tp->rto <<= tp->backoff;
-	}
+	tp->backoff = 0;
 	tcp_bound_rto(tp);
 }
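
The scenario in the comment above can be traced through the removed
"tp->rto <<= tp->backoff" path. A rough arithmetic sketch, with the RTT
smoothing omitted and the numbers chosen to match the comment (rtt grown
from 1 to 8 seconds, five retransmissions, rto capped at 120 seconds):

    #include <stdio.h>

    int main(void)
    {
        const unsigned int rto_max = 120;   /* seconds, per the comment above  */
        unsigned int rto_from_sample = 8;   /* rto built from the stale sample */
        unsigned int backoff = 5;           /* five retransmissions so far     */
        unsigned int old_rto, new_rto;

        /* Old behaviour: with retransmitted data ACKed, a valid timestamp
         * sample did not clear backoff, so rto was re-inflated.            */
        old_rto = rto_from_sample << backoff;           /* 256 s            */
        if (old_rto > rto_max)
            old_rto = rto_max;                          /* clamped to 120 s */

        /* New behaviour: the first valid sample clears backoff.            */
        new_rto = rto_from_sample;                      /* 8 s              */

        printf("old rto=%us, new rto=%us\n", old_rto, new_rto);
        return 0;
    }
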
 
@@ -1642,15 +1682,7 @@
 
 	tcp_rtt_estimator(tp, seq_rtt);
 	tcp_set_rto(tp);
-	if (tp->backoff) {
-		/* To relax it? We have valid sample as soon as we are
-		 * here. Why not to clear backoff?
-		 */
-		if (!tp->retransmits)
-			tp->backoff = 0;
-		else
-			tp->rto <<= tp->backoff;
-	}
+	tp->backoff = 0;
 	tcp_bound_rto(tp);
 }
 
@@ -1684,15 +1716,11 @@
 		} else
 			tp->snd_cwnd_cnt++;
         }
+	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
 /* Restart timer after forward progress on connection.
- * RFC2988 recommends (and BSD does) to restart timer to now+rto,
- * which is certainly wrong and effectively means that
- * rto includes one more _full_ rtt.
- *
- * For details see:
- * 	ftp://ftp.inr.ac.ru:/ip-routing/README.rto
+ * RFC2988 recommends to restart timer to now+rto.
  */
 
 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
@@ -1700,12 +1728,7 @@
 	if (tp->packets_out==0) {
 		tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
 	} else {
-		struct sk_buff *skb = skb_peek(&sk->write_queue);
-		__u32 when = tp->rto + tp->rttvar - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
-
-		if ((__s32)when < (__s32)tp->rttvar)
-			when = tp->rttvar;
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, min(when, TCP_RTO_MAX));
+		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
 	}
 }
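
The removed restart formula re-armed the retransmit timer for rto + rttvar
minus the time the head segment had already been outstanding, floored at
rttvar; the new code simply re-arms for a full rto from now, as RFC 2988
recommends. Comparing the two for sample values (jiffies, numbers invented):

    #include <stdio.h>

    int main(void)
    {
        const long rto = 300, rttvar = 50;  /* example values, in jiffies */
        long elapsed = 120;                 /* age of the head skb        */
        long old_when, new_when;

        old_when = rto + rttvar - elapsed;  /* old formula                */
        if (old_when < rttvar)
            old_when = rttvar;              /* old lower bound            */
        new_when = rto;                     /* RFC2988-style restart      */

        /* The new timer always expires a full rto after the latest ACK,
         * which is why the head-timeout check added earlier in this patch
         * is needed to keep fast retransmit responsive.                  */
        printf("old restart: %ld jiffies, new restart: %ld jiffies\n",
               old_when, new_when);
        return 0;
    }
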
 
@@ -1857,12 +1880,7 @@
 			/* Note, it is the only place, where
 			 * fast path is recovered for sending TCP.
 			 */
-			if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
-#ifdef TCP_FORMAL_WINDOW
-			    tcp_receive_window(tp) &&
-#endif
-			    !tp->urg_data)
-				tcp_fast_path_on(tp);
+			tcp_fast_path_check(sk, tp);
 
 			if (nwin > tp->max_window) {
 				tp->max_window = nwin;
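
tcp_fast_path_check() is added elsewhere in this patch (in include/net/tcp.h)
and is not shown in this file. Judging from the open-coded test it replaces
here and again in tcp_data_queue() below, it presumably re-enables header
prediction only when the out-of-order queue is empty and no urgent data is
pending; the real helper may apply further conditions, so the sketch below is
an inference, not its actual definition:

    /* Inferred sketch only, with invented stand-in types. */
    struct fastpath_state {
        int ofo_queue_len;   /* skb_queue_len(&tp->out_of_order_queue)  */
        int urg_data;        /* tp->urg_data                            */
        int pred_flags_on;   /* models the effect of tcp_fast_path_on() */
    };

    static void fast_path_check(struct fastpath_state *tp)
    {
        if (tp->ofo_queue_len == 0 && !tp->urg_data)
            tp->pred_flags_on = 1;   /* turn header prediction back on */
    }

    int main(void)
    {
        struct fastpath_state tp = { 0, 0, 0 };
        fast_path_check(&tp);
        return tp.pred_flags_on ? 0 : 1;
    }
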
@@ -1873,16 +1891,6 @@
 
 	tp->snd_una = ack;
 
-#ifdef TCP_DEBUG
-	if (before(tp->snd_una + tp->snd_wnd, tp->snd_nxt)) {
-		if (tp->snd_nxt-(tp->snd_una + tp->snd_wnd) >= (1<<tp->snd_wscale)
-		    && net_ratelimit())
-			printk(KERN_DEBUG "TCP: peer %u.%u.%u.%u:%u/%u shrinks window %u:%u:%u. Bad, what else can I say?\n",
-			       NIPQUAD(sk->daddr), htons(sk->dport), sk->num,
-			       tp->snd_una, tp->snd_wnd, tp->snd_nxt);
-	}
-#endif
-
 	return flag;
 }
 
@@ -2224,7 +2232,6 @@
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
-	tp->fin_seq = TCP_SKB_CB(skb)->end_seq;
 	tcp_schedule_ack(tp);
 
 	sk->shutdown |= RCV_SHUTDOWN;
@@ -2506,10 +2513,27 @@
 	}
 }
 
+static inline int tcp_rmem_schedule(struct sock *sk, struct sk_buff *skb)
+{
+	return (int)skb->truesize <= sk->forward_alloc ||
+		tcp_mem_schedule(sk, skb->truesize, 1);
+}
+
+static int tcp_prune_queue(struct sock *sk);
+
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
+	struct tcphdr *th = skb->h.th;
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-	int eaten = 0;
+	int eaten = -1;
+
+	th = skb->h.th;
+	__skb_pull(skb, th->doff*4);
+
+        if (skb->len == 0 && !th->fin)
+		goto drop;
+
+	TCP_ECN_accept_cwr(tp, skb);
 
 	if (tp->dsack) {
 		tp->dsack = 0;
@@ -2535,26 +2559,32 @@
 			__set_current_state(TASK_RUNNING);
 
 			local_bh_enable();
-			if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) {
+			if (skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
 				sk->err = EFAULT;
 				sk->error_report(sk);
 			}
 			local_bh_disable();
 			tp->ucopy.len -= chunk;
 			tp->copied_seq += chunk;
-			eaten = (chunk == skb->len && !skb->h.th->fin);
+			eaten = (chunk == skb->len && !th->fin);
 		}
 
-		if (!eaten) {
+		if (eaten <= 0) {
 queue_and_out:
+			if (eaten < 0 &&
+			    (atomic_read(&sk->rmem_alloc) > sk->rcvbuf ||
+			     !tcp_rmem_schedule(sk, skb))) {
+				if (tcp_prune_queue(sk) < 0 || !tcp_rmem_schedule(sk, skb))
+					goto drop;
+			}
 			tcp_set_owner_r(skb, sk);
 			__skb_queue_tail(&sk->receive_queue, skb);
 		}
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->len)
 			tcp_event_data_recv(sk, tp, skb);
-		if(skb->h.th->fin)
-			tcp_fin(skb, sk, skb->h.th);
+		if(th->fin)
+			tcp_fin(skb, sk, th);
 
 		if (skb_queue_len(&tp->out_of_order_queue)) {
 			tcp_ofo_queue(sk);
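
The receive path above now does its own memory admission: a segment is queued
directly when it fits the socket's receive budget, otherwise tcp_prune_queue()
is called (collapsing the queues, see further down) and the check is retried
once before the segment is dropped. A condensed model of that decision, with
invented types and a stubbed prune step:

    #include <stdio.h>

    struct rx_budget {
        int rmem_alloc;     /* bytes currently charged to the socket  */
        int rcvbuf;         /* SO_RCVBUF limit                        */
        int forward_alloc;  /* pre-reserved quota (sk->forward_alloc) */
    };

    /* Models tcp_rmem_schedule(): true if truesize fits the reserved quota
     * (the real helper can also reserve more via tcp_mem_schedule()).     */
    static int rmem_schedule(struct rx_budget *b, int truesize)
    {
        return truesize <= b->forward_alloc;
    }

    /* Models a successful tcp_prune_queue(): collapsing frees memory. */
    static int prune_queue(struct rx_budget *b)
    {
        b->rmem_alloc /= 2;
        return 0;           /* < 0 would mean "still over the limits" */
    }

    static int admit(struct rx_budget *b, int truesize)
    {
        if (b->rmem_alloc > b->rcvbuf || !rmem_schedule(b, truesize)) {
            if (prune_queue(b) < 0 || !rmem_schedule(b, truesize))
                return 0;   /* drop the segment */
        }
        return 1;           /* queue it         */
    }

    int main(void)
    {
        struct rx_budget b = { 70000, 65535, 2048 };
        printf("1500-byte skb %s\n", admit(&b, 1500) ? "queued" : "dropped");
        return 0;
    }
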
@@ -2569,15 +2599,9 @@
 		if(tp->num_sacks)
 			tcp_sack_remove(tp);
 
-		/* Turn on fast path. */ 
-		if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
-#ifdef TCP_FORMAL_WINDOW
-		    tcp_receive_window(tp) &&
-#endif
-		    !tp->urg_data)
-			tcp_fast_path_on(tp);
+		tcp_fast_path_check(sk, tp);
 
-		if (eaten) {
+		if (eaten > 0) {
 			__kfree_skb(skb);
 		} else if (!sk->dead)
 			sk->data_ready(sk, 0);
@@ -2592,17 +2616,12 @@
 
 out_of_window:
 		tcp_schedule_ack(tp);
+drop:
 		__kfree_skb(skb);
 		return;
 	}
 
-	/* Out of window. F.e. zero window probe.
-	 *
-	 * Note: it is highly possible that we may open window and enqueue
-	 * this segment now. However, this will be known only after we queue
-	 * it, which will result in queue full of successive 1 byte BSD
-	 * window probes, it is SWS in fact. So, always reject it and send ACK.
-	 */
+	/* Out of window. F.e. zero window probe. */
 	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt+tcp_receive_window(tp)))
 		goto out_of_window;
 
@@ -2626,6 +2645,12 @@
 
 	TCP_ECN_check_ce(tp, skb);
 
+	if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf ||
+	    !tcp_rmem_schedule(sk, skb)) {
+		if (tcp_prune_queue(sk) < 0 || !tcp_rmem_schedule(sk, skb))
+			goto drop;
+	}
+
 	/* Disable header prediction. */
 	tp->pred_flags = 0;
 	tcp_schedule_ack(tp);
@@ -2704,52 +2729,142 @@
 	}
 }
 
-
-static void tcp_collapse_queue(struct sock *sk, struct sk_buff_head *q)
+/* Collapse contiguous sequence of skbs head..tail with
+ * sequence numbers start..end.
+ * Segments with FIN/SYN are not collapsed (only because this
+ * simplifies code)
+ */
+static void
+tcp_collapse(struct sock *sk, struct sk_buff *head,
+	     struct sk_buff *tail, u32 start, u32 end)
 {
-	struct sk_buff *skb = skb_peek(q);
-	struct sk_buff *skb_next;
+	struct sk_buff *skb;
 
-	while (skb &&
-	       skb != (struct sk_buff *)q &&
-	       (skb_next = skb->next) != (struct sk_buff *)q) {
-		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
-		struct tcp_skb_cb *scb_next = TCP_SKB_CB(skb_next);
-
-		if (scb->end_seq == scb_next->seq &&
-		    skb_tailroom(skb) >= skb_next->len &&
-#define TCP_DONT_COLLAPSE (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN)
-		    !(tcp_flag_word(skb->h.th)&TCP_DONT_COLLAPSE) &&
-		    !(tcp_flag_word(skb_next->h.th)&TCP_DONT_COLLAPSE)) {
-			/* OK to collapse two skbs to one */
-			memcpy(skb_put(skb, skb_next->len), skb_next->data, skb_next->len);
-			__skb_unlink(skb_next, skb_next->list);
-			scb->end_seq = scb_next->end_seq;
-			__kfree_skb(skb_next);
+	/* First, check that queue is collapsable and find
+	 * the point where collapsing can be useful. */
+	for (skb = head; skb != tail; ) {
+		/* No new bits? It is possible on ofo queue. */
+		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+			struct sk_buff *next = skb->next;
+			__skb_unlink(skb, skb->list);
+			__kfree_skb(skb);
 			NET_INC_STATS_BH(TCPRcvCollapsed);
-		} else {
-			/* Lots of spare tailroom, reallocate this skb to trim it. */
-			if (tcp_win_from_space(skb->truesize) > skb->len &&
-			    skb_tailroom(skb) > sizeof(struct sk_buff) + 16) {
-				struct sk_buff *nskb;
-
-				nskb = skb_copy_expand(skb, skb_headroom(skb), 0, GFP_ATOMIC);
-				if (nskb) {
-					tcp_set_owner_r(nskb, sk);
-					memcpy(nskb->data-skb_headroom(skb),
-					       skb->data-skb_headroom(skb),
-					       skb_headroom(skb));
-					__skb_append(skb, nskb);
-					__skb_unlink(skb, skb->list);
-					__kfree_skb(skb);
-				}
+			skb = next;
+			continue;
+		}
+
+		/* The first skb to collapse is:
+		 * - not SYN/FIN and
+		 * - bloated or contains data before "start" or
+		 *   overlaps to the next one.
+		 */
+		if (!skb->h.th->syn && !skb->h.th->fin &&
+		    (tcp_win_from_space(skb->truesize) > skb->len ||
+		     before(TCP_SKB_CB(skb)->seq, start) ||
+		     (skb->next != tail &&
+		      TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq)))
+			break;
+
+		/* Decided to skip this, advance start seq. */
+		start = TCP_SKB_CB(skb)->end_seq;
+		skb = skb->next;
+	}
+	if (skb == tail || skb->h.th->syn || skb->h.th->fin)
+		return;
+
+	while (before(start, end)) {
+		struct sk_buff *nskb;
+		int header = skb_headroom(skb);
+		int copy = (PAGE_SIZE - sizeof(struct sk_buff) -
+			    sizeof(struct skb_shared_info) - header - 31)&~15;
+
+		/* Too big header? This can happen with IPv6. */
+		if (copy < 0)
+			return;
+		if (end-start < copy)
+			copy = end-start;
+		nskb = alloc_skb(copy+header, GFP_ATOMIC);
+		if (!nskb)
+			return;
+		skb_reserve(nskb, header);
+		memcpy(nskb->head, skb->head, header);
+		nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head);
+		nskb->h.raw = nskb->head + (skb->h.raw-skb->head);
+		nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
+		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
+		__skb_insert(nskb, skb->prev, skb, skb->list);
+		tcp_set_owner_r(nskb, sk);
+
+		/* Copy data, releasing collapsed skbs. */
+		while (copy > 0) {
+			int offset = start - TCP_SKB_CB(skb)->seq;
+			int size = TCP_SKB_CB(skb)->end_seq - start;
+
+			if (offset < 0) BUG();
+			if (size > 0) {
+				size = min(copy, size);
+				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
+					BUG();
+				TCP_SKB_CB(nskb)->end_seq += size;
+				copy -= size;
+				start += size;
+			}
+			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+				struct sk_buff *next = skb->next;
+				__skb_unlink(skb, skb->list);
+				__kfree_skb(skb);
+				NET_INC_STATS_BH(TCPRcvCollapsed);
+				skb = next;
+				if (skb == tail || skb->h.th->syn || skb->h.th->fin)
+					return;
 			}
-			skb = skb_next;
 		}
 	}
 }
 
-/* Clean the out_of_order queue if we can, trying to get
+/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
+ * and tcp_collapse() them until all the queue is collapsed.
+ */
+static void tcp_collapse_ofo_queue(struct sock *sk)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
+	struct sk_buff *head;
+	u32 start, end;
+
+	if (skb == NULL)
+		return;
+
+	start = TCP_SKB_CB(skb)->seq;
+	end = TCP_SKB_CB(skb)->end_seq;
+	head = skb;
+
+	for (;;) {
+		skb = skb->next;
+
+		/* Segment is terminated when we see gap or when
+		 * we are at the end of all the queue. */
+		if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
+		    after(TCP_SKB_CB(skb)->seq, end) ||
+		    before(TCP_SKB_CB(skb)->end_seq, start)) {
+			tcp_collapse(sk, head, skb, start, end);
+			head = skb;
+			if (skb == (struct sk_buff *)&tp->out_of_order_queue)
+				break;
+			/* Start new segment */
+			start = TCP_SKB_CB(skb)->seq;
+			end = TCP_SKB_CB(skb)->end_seq;
+		} else {
+			if (before(TCP_SKB_CB(skb)->seq, start))
+				start = TCP_SKB_CB(skb)->seq;
+			if (after(TCP_SKB_CB(skb)->end_seq, end))
+				end = TCP_SKB_CB(skb)->end_seq;
+		}
+	}
+}
+
+/* Reduce allocated memory if we can, trying to get
  * the socket within its memory limits again.
  *
  * Return less than zero if we should start dropping frames
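
The replacement skb sizing in tcp_collapse() above tries to pack collapsed
data into roughly one page per skb: from PAGE_SIZE it subtracts the skb
metadata and the preserved headroom, then rounds down to a multiple of 16
with "& ~15" (and gives up if the headroom alone, e.g. with IPv6, leaves no
room). A small arithmetic demonstration with made-up structure sizes, since
the real sizeof values depend on kernel version and architecture:

    #include <stdio.h>

    int main(void)
    {
        const int page_size = 4096;
        const int skb_size = 160;     /* pretend sizeof(struct sk_buff)         */
        const int shinfo_size = 16;   /* pretend sizeof(struct skb_shared_info) */
        int header, copy;

        for (header = 0; header <= 128; header += 64) {
            copy = (page_size - skb_size - shinfo_size - header - 31) & ~15;
            printf("headroom %3d -> copy %4d bytes (multiple of 16: %s)\n",
                   header, copy, copy % 16 == 0 ? "yes" : "no");
        }
        return 0;
    }
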
@@ -2769,8 +2884,10 @@
 	else if (tcp_memory_pressure)
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss);
 
-	tcp_collapse_queue(sk, &sk->receive_queue);
-	tcp_collapse_queue(sk, &tp->out_of_order_queue);
+	tcp_collapse_ofo_queue(sk);
+	tcp_collapse(sk, sk->receive_queue.next,
+		     (struct sk_buff*)&sk->receive_queue,
+		     tp->copied_seq, tp->rcv_nxt);
 	tcp_mem_reclaim(sk);
 
 	if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
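
tcp_collapse_ofo_queue() walks the out-of-order queue and hands each run of
contiguous or overlapping skbs to tcp_collapse() as a single start..end
range, starting a new run at every sequence gap. The sweep below reproduces
that grouping on an array of invented sequence ranges, using the same
wrap-safe before()/after() comparisons:

    #include <stdio.h>

    typedef unsigned int u32;
    static int seq_before(u32 a, u32 b) { return (int)(a - b) < 0; }
    static int seq_after(u32 a, u32 b)  { return seq_before(b, a); }

    struct range { u32 seq, end_seq; };

    int main(void)
    {
        /* Two contiguous segments, one overlap, then a gap. */
        struct range q[] = {
            { 100, 200 }, { 200, 300 }, { 250, 320 }, { 400, 500 },
        };
        int i, n = sizeof(q) / sizeof(q[0]);
        u32 start = q[0].seq, end = q[0].end_seq;

        for (i = 1; i <= n; i++) {
            if (i == n || seq_after(q[i].seq, end) ||
                seq_before(q[i].end_seq, start)) {
                printf("collapse run %u..%u\n", start, end);
                if (i == n)
                    break;
                start = q[i].seq;       /* gap: start a new run */
                end = q[i].end_seq;
            } else {
                if (seq_before(q[i].seq, start))
                    start = q[i].seq;
                if (seq_after(q[i].end_seq, end))
                    end = q[i].end_seq;
            }
        }
        return 0;
    }
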
@@ -2804,59 +2921,10 @@
 	NET_INC_STATS_BH(RcvPruned);
 
 	/* Massive buffer overcommit. */
+	tp->pred_flags = 0;
 	return -1;
 }
 
-static inline int tcp_rmem_schedule(struct sock *sk, struct sk_buff *skb)
-{
-	return (int)skb->truesize <= sk->forward_alloc ||
-		tcp_mem_schedule(sk, skb->truesize, 1);
-}
-
-/*
- *	This routine handles the data.  If there is room in the buffer,
- *	it will be have already been moved into it.  If there is no
- *	room, then we will just have to discard the packet.
- */
-
-static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
-{
-	struct tcphdr *th;
-	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
-	th = skb->h.th;
-	skb_pull(skb, th->doff*4);
-	skb_trim(skb, len - (th->doff*4));
-
-        if (skb->len == 0 && !th->fin)
-		goto drop;
-
-	TCP_ECN_accept_cwr(tp, skb);
-
-	/* 
-	 *	If our receive queue has grown past its limits shrink it.
-	 *	Make sure to do this before moving rcv_nxt, otherwise
-	 *	data might be acked for that we don't have enough room.
-	 */
-	if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf ||
-	    !tcp_rmem_schedule(sk, skb)) {
-		if (tcp_prune_queue(sk) < 0 || !tcp_rmem_schedule(sk, skb))
-			goto drop;
-	}
-
-	tcp_data_queue(sk, skb);
-
-#ifdef TCP_DEBUG
-	if (before(tp->rcv_nxt, tp->copied_seq)) {
-		printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
-		tp->rcv_nxt = tp->copied_seq;
-	}
-#endif
-	return;
-
-drop:
-	__kfree_skb(skb);
-}
 
 /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
  * As additional protections, we do not touch cwnd in retransmission phases,
@@ -2902,22 +2970,7 @@
 		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 
-	/* Wakeup users. */
-	if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
-		struct socket *sock = sk->socket;
-
-		clear_bit(SOCK_NOSPACE, &sock->flags);
-
-		if (sk->sleep && waitqueue_active(sk->sleep))
-			wake_up_interruptible(sk->sleep);
-
-		if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN))
-			sock_wake_async(sock, 2, POLL_OUT);
-
-		/* Satisfy those who hook write_space() callback. */
-		if (sk->write_space != tcp_write_space)
-			sk->write_space(sk);
-	}
+	sk->write_space(sk);
 }
 
 static inline void tcp_check_space(struct sock *sk)
@@ -2937,7 +2990,7 @@
 
 	if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
 	    tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
-	    tcp_write_xmit(sk))
+	    tcp_write_xmit(sk, tp->nonagle))
 		tcp_check_probe_timer(sk, tp);
 }
 
@@ -3009,6 +3062,19 @@
 	if (after(tp->copied_seq, ptr))
 		return;
 
+	/* Do not replay urg ptr.
+	 *
+	 * NOTE: interesting situation not covered by specs.
+	 * Misbehaving sender may send urg ptr, pointing to segment,
+	 * which we already have in ofo queue. We are not able to fetch
+	 * such data and will stay in TCP_URG_NOTYET until will be eaten
+	 * by recvmsg(). Seems, we are not obliged to handle such wicked
+	 * situations. But it is worth to think about possibility of some
+	 * DoSes using some hypothetical application level deadlock.
+	 */
+	if (before(ptr, tp->rcv_nxt))
+		return;
+
 	/* Do we already have a newer (or duplicate) urgent pointer? */
 	if (tp->urg_data && !after(ptr, tp->urg_seq))
 		return;
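
The new check above discards an urgent pointer that falls behind rcv_nxt,
i.e. one that points at data which has already been received in order, so a
misbehaving peer cannot park the connection in TCP_URG_NOTYET waiting for a
byte that recvmsg() will only ever deliver as ordinary data. Sequence numbers
are compared with the usual wrap-safe before() test; a tiny illustration with
invented values:

    #include <stdio.h>

    typedef unsigned int u32;
    static int seq_before(u32 a, u32 b) { return (int)(a - b) < 0; }

    int main(void)
    {
        u32 rcv_nxt = 2000;  /* everything below this was already received */
        u32 stale = 1500;    /* urgent pointer behind rcv_nxt -> ignored   */
        u32 fresh = 2100;    /* urgent data still to come -> accepted      */

        printf("ptr=%u: %s\n", stale,
               seq_before(stale, rcv_nxt) ? "ignored" : "accepted");
        printf("ptr=%u: %s\n", fresh,
               seq_before(fresh, rcv_nxt) ? "ignored" : "accepted");
        return 0;
    }
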
@@ -3027,9 +3093,27 @@
 	 * tp->copied_seq since we would read the last urgent byte again
 	 * as data, nor can we alter copied_seq until this data arrives
 	 * or we break the sematics of SIOCATMARK (and thus sockatmark())
-	 */
-	if (tp->urg_seq == tp->copied_seq)
-		tp->copied_seq++;	/* Move the copied sequence on correctly */
+	 *
+	 * NOTE. Double Dutch. Rendering to plain English: author of comment
+	 * above did something sort of 	send("A", MSG_OOB); send("B", MSG_OOB);
+	 * and expect that both A and B disappear from stream. This is _wrong_.
+	 * Though this happens in BSD with high probability, this is occasional.
+	 * Any application relying on this is buggy. Note also, that fix "works"
+	 * only in this artificial test. Insert some normal data between A and B and we will
+	 * decline of BSD again. Verdict: it is better to remove to trap
+	 * buggy users.
+	 */
+	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
+	    !sk->urginline &&
+	    tp->copied_seq != tp->rcv_nxt) {
+		struct sk_buff *skb = skb_peek(&sk->receive_queue);
+		tp->copied_seq++;
+		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
+			__skb_unlink(skb, skb->list);
+			__kfree_skb(skb);
+		}
+	}
+
 	tp->urg_data = TCP_URG_NOTYET;
 	tp->urg_seq = ptr;
 
@@ -3038,7 +3122,7 @@
 }
 
 /* This is the 'fast' part of urgent handling. */
-static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
+static inline void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
@@ -3048,11 +3132,14 @@
 
 	/* Do we wait for any urgent data? - normally not... */
 	if (tp->urg_data == TCP_URG_NOTYET) {
-		u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
+		u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4) - th->syn;
 
 		/* Is the urgent pointer pointing into this packet? */	 
-		if (ptr < len) {
-			tp->urg_data = TCP_URG_VALID | *(ptr + (unsigned char *) th);
+		if (ptr < skb->len) {
+			u8 tmp;
+			if (skb_copy_bits(skb, ptr, &tmp, 1))
+				BUG();
+			tp->urg_data = TCP_URG_VALID | tmp;
 			if (!sk->dead)
 				sk->data_ready(sk,0);
 		}
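
The fast-path fetch above turns the urgent sequence number into a byte offset
within the current skb: (urg_seq - seq) is the offset into the segment's
payload, th->doff*4 accounts for the TCP header that has not yet been pulled
at this point, and th->syn compensates for the sequence number a SYN consumes
without carrying a data byte. skb_copy_bits() then also works for non-linear
skbs, which the old direct pointer dereference did not. Worked numbers
(invented):

    #include <stdio.h>

    int main(void)
    {
        unsigned int seq = 1000;      /* first sequence number of the segment */
        unsigned int urg_seq = 1005;  /* sequence number of the urgent byte   */
        unsigned int doff = 8;        /* TCP header length in 32-bit words    */
        unsigned int syn = 0;         /* set only on SYN segments             */

        /* Offset of the urgent byte from the start of the TCP header,
         * i.e. the offset handed to skb_copy_bits() in the patch.            */
        unsigned int ptr = urg_seq - seq + doff * 4 - syn;

        printf("urgent byte at skb offset %u (payload byte %u)\n",
               ptr, urg_seq - seq);
        return 0;
    }
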
@@ -3067,9 +3154,9 @@
 
 	local_bh_enable();
 	if (skb->ip_summed==CHECKSUM_UNNECESSARY)
-		err = memcpy_toiovec(tp->ucopy.iov, skb->h.raw + hlen, chunk);
+		err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
 	else
-		err = copy_and_csum_toiovec(tp->ucopy.iov, skb, hlen);
+		err = skb_copy_and_csum_datagram_iovec(skb, hlen, tp->ucopy.iov);
 
 	if (!err) {
 update:
@@ -3117,32 +3204,6 @@
  * 	disabled when:
  *	- A zero window was announced from us - zero window probing
  *        is only handled properly in the slow path. 
- *	  [ NOTE: actually, it was made incorrectly and nobody ever noticed
- *	    this! Reason is clear: 1. Correct senders do not send
- *	    to zero window. 2. Even if a sender sends to zero window,
- *	    nothing terrible occurs.
- *
- *	    For now I cleaned this and fast path is really always disabled,
- *	    when window is zero, but I would be more happy to remove these
- *	    checks. Code will be only cleaner and _faster_.    --ANK
- *	
- *	    Later note. I've just found that slow path also accepts
- *	    out of window segments, look at tcp_sequence(). So...
- *	    it is the last argument: I repair all and comment out
- *	    repaired code by TCP_FORMAL_WINDOW.
- *	    [ I remember one rhyme from a chidren's book. (I apologize,
- *	      the trasnlation is not rhymed 8)): people in one (jewish) village
- *	      decided to build sauna, but divided to two parties.
- *	      The first one insisted that battens should not be dubbed,
- *	      another objected that foots will suffer of splinters,
- *	      the first fended that dubbed wet battens are too slippy
- *	      and people will fall and it is much more serious!
- *	      Certaiinly, all they went to rabbi.
- *	      After some thinking, he judged: "Do not be lazy!
- *	      Certainly, dub the battens! But put them by dubbed surface down."
- *          ]
- *        ]
- *
  *	- Out of order segments arrived.
  *	- Urgent data is expected.
  *	- There is no buffer space left
@@ -3348,7 +3409,7 @@
 
 	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
 
-	if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
+	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		TCP_INC_STATS_BH(TcpInErrs);
 		NET_INC_STATS_BH(TCPAbortOnSyn);
 		tcp_reset(sk);
@@ -3360,10 +3421,10 @@
 		tcp_ack(sk, skb, FLAG_SLOWPATH);
 
 	/* Process urgent data. */
-	tcp_urg(sk, th, len);
+	tcp_urg(sk, skb, th);
 
 	/* step 7: process the segment text */
-	tcp_data(skb, sk, len);
+	tcp_data_queue(sk, skb);
 
 	tcp_data_snd_check(sk);
 	tcp_ack_snd_check(sk);
@@ -3452,8 +3513,6 @@
 		 */
 		tp->snd_wnd = ntohs(th->window);
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
-		tp->syn_seq = TCP_SKB_CB(skb)->seq;
-		tp->fin_seq = TCP_SKB_CB(skb)->seq;
 
 		if (tp->wscale_ok == 0) {
 			tp->snd_wscale = tp->rcv_wscale = 0;
@@ -3488,7 +3547,7 @@
 
 		/* Remember, tcp_poll() does not lock socket!
 		 * Change state from SYN-SENT only after copied_seq
-		 * is initilized. */
+		 * is initialized. */
 		tp->copied_seq = tp->rcv_nxt;
 		mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
@@ -3498,7 +3557,7 @@
 			sk_wake_async(sk, 0, POLL_OUT);
 		}
 
-		if (tp->write_pending || tp->defer_accept) {
+		if (tp->write_pending || tp->defer_accept || tp->ack.pingpong) {
 			/* Save one ACK. Data will be ready after
 			 * several ticks, if write_pending is set.
 			 *
@@ -3508,6 +3567,8 @@
 			 */
 			tcp_schedule_ack(tp);
 			tp->ack.lrcvtime = tcp_time_stamp;
+			tp->ack.ato = TCP_ATO_MIN;
+			tcp_incr_quickack(tp);
 			tcp_enter_quickack_mode(tp);
 			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
 
@@ -3683,21 +3744,9 @@
 
 	/*	step 4:
 	 *
-	 *	Check for a SYN, and ensure it matches the SYN we were
-	 *	first sent. We have to handle the rather unusual (but valid)
-	 *	sequence that KA9Q derived products may generate of
-	 *
-	 *	SYN
-	 *				SYN|ACK Data
-	 *	ACK	(lost)
-	 *				SYN|ACK Data + More Data
-	 *	.. we must ACK not RST...
-	 *
-	 *	We keep syn_seq as the sequence space occupied by the 
-	 *	original syn. 
+	 *	Check for a SYN in window.
 	 */
-
-	if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
+	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS_BH(TCPAbortOnSyn);
 		tcp_reset(sk);
 		return 1;
@@ -3713,6 +3762,7 @@
 				tp->copied_seq = tp->rcv_nxt;
 				mb();
 				tcp_set_state(sk, TCP_ESTABLISHED);
+				sk->state_change(sk);
 
 				/* Note, that this wakeup is only for marginal
 				 * crossed SYN case. Passively open sockets
@@ -3720,7 +3770,6 @@
 				 * and sk->socket == NULL.
 				 */
 				if (sk->socket) {
-					sk->state_change(sk);
 					sk_wake_async(sk,0,POLL_OUT);
 				}
 
@@ -3806,13 +3855,13 @@
 
 step6:
 	/* step 6: check the URG bit */
-	tcp_urg(sk, th, len);
+	tcp_urg(sk, skb, th);
 
 	/* step 7: process the segment text */
 	switch (sk->state) {
 	case TCP_CLOSE_WAIT:
 	case TCP_CLOSING:
-		if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
+		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
 			break;
 	case TCP_FIN_WAIT1:
 	case TCP_FIN_WAIT2:
@@ -3830,7 +3879,7 @@
 		}
 		/* Fall through */
 	case TCP_ESTABLISHED: 
-		tcp_data(skb, sk, len);
+		tcp_data_queue(sk, skb);
 		queued = 1;
 		break;
 	}
