patch-2.1.90 linux/net/ipv4/tcp_input.c
- Lines: 1344
- Date: Mon Mar 16 10:36:01 1998
- Orig file: v2.1.89/linux/net/ipv4/tcp_input.c
- Orig date: Tue Mar 10 10:03:37 1998
diff -u --recursive --new-file v2.1.89/linux/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.66 1998/01/15 22:40:29 freitag Exp $
+ * Version: $Id: tcp_input.c,v 1.84 1998/03/15 03:23:20 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -67,57 +67,54 @@
extern int sysctl_tcp_fin_timeout;
+/* These are on by default so the code paths get tested.
+ * For the final 2.2 this may be undone at our discretion. -DaveM
+ */
+int sysctl_tcp_timestamps = 1;
+int sysctl_tcp_window_scaling = 1;
+
int sysctl_tcp_cong_avoidance;
int sysctl_tcp_hoe_retransmits;
-int sysctl_tcp_sack;
-int sysctl_tcp_tsack;
-int sysctl_tcp_timestamps;
-int sysctl_tcp_window_scaling;
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_stdurg;
+int sysctl_tcp_rfc1337;
static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;
-/*
- * Called each time to estimate the delayed ack timeout. This is
- * how it should be done so a fast link isnt impacted by ack delay.
- *
- * I think we need a medium deviation here also...
- * The estimated value is changing to fast
+/* There is something which you must keep in mind when you analyze the
+ * behavior of the tp->ato delayed ack timeout interval. When a
+ * connection starts up, we want to ack as quickly as possible. The
+ * problem is that "good" TCP's do slow start at the beginning of data
+ * transmission. This means that until we send the first few ACK's the
+ * sender will sit on his end and only queue most of his data, because
+ * he can only send snd_cwnd unacked packets at any given time. For
+ * each ACK we send, he increments snd_cwnd and transmits more of his
+ * queue. -DaveM
*/
-
static void tcp_delack_estimator(struct tcp_opt *tp)
{
- int m;
-
- /* Delayed ACK time estimator. */
-
- m = jiffies - tp->lrcvtime;
-
- tp->lrcvtime = jiffies;
+ if(tp->ato == 0) {
+ tp->lrcvtime = jiffies;
- if (m < 0)
- return;
-
- /* if the mesured value is bigger than
- * twice the round trip time ignore it.
- */
- if ((m << 2) <= tp->srtt) {
- m -= (tp->iat >> 3);
- tp->iat += m;
-
- if (m <0)
- m = -m;
-
- m -= (tp->iat_mdev >> 2);
- tp->iat_mdev += m;
+ /* Help sender leave slow start quickly,
+ * this sets our initial ato value.
+ */
+ tcp_enter_quickack_mode(tp);
+ } else {
+ int m = jiffies - tp->lrcvtime;
- tp->ato = (tp->iat >> 3) + (tp->iat_mdev >> 2);
+ tp->lrcvtime = jiffies;
+ if(m <= 0)
+ m = 1;
+ if(m > tp->rto)
+ tp->ato = tp->rto;
+ else
+ tp->ato = (tp->ato >> 1) + m;
- if (tp->ato < HZ/50)
- tp->ato = HZ/50;
- } else
- tp->ato = 0;
+ /* We are not in "quick ack" mode. */
+ if(tp->ato <= (HZ/100))
+ tp->ato = ((HZ/100)*2);
+ }
}
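The replacement estimator above is a half-decay filter: each inter-arrival gap m pulls tp->ato toward m, clamped above by tp->rto and floored just past the quick-ack threshold. A rough userland sketch of the recurrence (HZ, the RTO, and the seed value are illustrative assumptions, not the kernel's numbers):

#include <stdio.h>

#define HZ 100	/* assumed clock rate, 10ms jiffies */

int main(void)
{
	long ato = (HZ / 100) * 2;	/* seed as if quick-ack mode just ended */
	long rto = HZ / 5;		/* assumed 200ms retransmit timeout */
	long gaps[] = { 3, 5, 4, 4, 50, 4 };	/* jiffies between arrivals */
	int i;

	for (i = 0; i < 6; i++) {
		long m = (gaps[i] <= 0) ? 1 : gaps[i];

		if (m > rto)
			ato = rto;		/* clamp: never outlast the RTO */
		else
			ato = (ato >> 1) + m;	/* half-decay toward the gap */
		if (ato <= HZ / 100)
			ato = (HZ / 100) * 2;	/* floor just above quick-ack */
		printf("gap=%2ld -> ato=%ld\n", gaps[i], ato);
	}
	return 0;
}

The clamp to rto matters on the gap of 50: without it a single long pause would inflate the delayed-ack interval past the retransmit timer.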
/* Called to compute a smoothed rtt estimate. The data fed to this
@@ -132,9 +129,9 @@
static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
{
- long m;
- /*
- * The following amusing code comes from Jacobson's
+ long m = mrtt; /* RTT */
+
+ /* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
* are scaled versions of rtt and mean deviation.
* This is designed to be as fast as possible
@@ -143,12 +140,9 @@
* On a 1990 paper the rto value is changed to:
* RTO = rtt + 4 * mdev
*/
-
- m = mrtt; /* RTT */
-
+ if(m == 0)
+ m = 1;
if (tp->srtt != 0) {
- if(m<=0)
- m=1; /* IS THIS RIGHT FOR <0 ??? */
m -= (tp->srtt >> 3); /* m is now error in rtt est */
tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0)
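For reference, Jacobson's scheme as used here keeps srtt scaled by 8 and mdev scaled by 4 so the 7/8 and 3/4 decays reduce to shifts, and RTO = rtt + 4*mdev falls out of the scaled sum. A self-contained sketch (the mdev seeding and the final rto formula are assumptions; the clamping done by the kernel's tcp_set_rto()/tcp_bound_rto() is omitted):

#include <stdio.h>

struct rtt_est {
	long srtt;	/* smoothed RTT, stored scaled by 8 */
	long mdev;	/* mean deviation, stored scaled by 4 */
};

static void rtt_sample(struct rtt_est *e, long mrtt)
{
	long m = mrtt ? mrtt : 1;	/* never feed the filter a zero */

	if (e->srtt != 0) {
		m -= (e->srtt >> 3);	/* error against current estimate */
		e->srtt += m;		/* srtt = 7/8 srtt + 1/8 new */
		if (m < 0)
			m = -m;
		m -= (e->mdev >> 2);
		e->mdev += m;		/* mdev = 3/4 mdev + 1/4 |err| */
	} else {
		e->srtt = m << 3;	/* first sample primes the filter */
		e->mdev = m << 2;	/* seeding choice is an assumption */
	}
}

static long rtt_rto(const struct rtt_est *e)
{
	return (e->srtt >> 3) + e->mdev;	/* RTO = rtt + 4*mdev, via scaling */
}

int main(void)
{
	struct rtt_est e = { 0, 0 };
	long samples[] = { 8, 10, 9, 30, 9 };	/* RTTs in jiffies */
	int i;

	for (i = 0; i < 5; i++) {
		rtt_sample(&e, samples[i]);
		printf("mrtt=%2ld -> rto=%ld\n", samples[i], rtt_rto(&e));
	}
	return 0;
}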
@@ -202,19 +196,17 @@
*/
if (!before(end_seq,tp->last_ack_sent)) {
tp->ts_recent = tp->rcv_tsval;
- /* FIXME: need a corse timestamp. Days uptime
- * would be good.
- */
tp->ts_recent_stamp = jiffies;
}
}
+#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
+
extern __inline__ int tcp_paws_discard(struct tcp_opt *tp)
{
- /* FIXME: must check that ts_recent is not
- * more than 24 days old here. Yuck.
- */
- return ((s32)(tp->rcv_tsval-tp->ts_recent) < 0);
+ /* ts_recent must be younger than 24 days */
+ return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
+ ((s32)(tp->rcv_tsval-tp->ts_recent) < 0));
}
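The second clause is the usual serial-number trick: subtracting two 32-bit timestamps and testing the sign is wraparound-safe while the values stay within 2^31 of each other, which is exactly why the 24-day staleness guard is needed. A small demonstration:

/* Demo of the comparison PAWS relies on: 32-bit subtraction followed
 * by a signed test works across timestamp wraparound.
 */
#include <stdio.h>
#include <stdint.h>

static int ts_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;	/* a precedes b, mod 2^32 */
}

int main(void)
{
	printf("%d\n", ts_before(100, 200));		/* 1: plainly older */
	printf("%d\n", ts_before(0xfffffff0u, 16));	/* 1: older across wrap */
	printf("%d\n", ts_before(16, 0xfffffff0u));	/* 0: newer across wrap */
	return 0;
}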
@@ -257,8 +249,6 @@
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->state) {
- case TCP_TIME_WAIT:
- break;
case TCP_SYN_SENT:
sk->err = ECONNREFUSED;
break;
@@ -268,23 +258,8 @@
default:
sk->err = ECONNRESET;
};
-#ifdef CONFIG_TCP_RFC1337
- /*
- * Time wait assassination protection [RFC1337]
- *
- * This is a good idea, but causes more sockets to take time to close.
- *
- * Ian Heavens has since shown this is an inadequate fix for the protocol
- * bug in question.
- */
- if(sk->state!=TCP_TIME_WAIT) {
- tcp_set_state(sk,TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- }
-#else
tcp_set_state(sk,TCP_CLOSE);
sk->shutdown = SHUTDOWN_MASK;
-#endif
if (!sk->dead)
sk->state_change(sk);
}
@@ -302,7 +277,6 @@
int length=(th->doff*4)-sizeof(struct tcphdr);
ptr = (unsigned char *)(th + 1);
- tp->sacks = 0;
tp->saw_tstamp = 0;
while(length>0) {
@@ -336,10 +310,6 @@
tp->snd_wscale = *(__u8 *)ptr;
}
break;
- case TCPOPT_SACK_PERM:
- if(opsize==TCPOLEN_SACK_PERM && th->syn)
- if (sysctl_tcp_sack && !no_fancy)
- tp->sack_ok = 1;
case TCPOPT_TIMESTAMP:
if(opsize==TCPOLEN_TIMESTAMP) {
/* Cheaper to set again than to
@@ -353,18 +323,6 @@
}
}
break;
- case TCPOPT_SACK:
- if (no_fancy || !sysctl_tcp_sack)
- break;
- tp->sacks = (opsize-2)>>3;
- if (tp->sacks<<3 == opsize-2) {
- int i;
- for (i = 0; i < tp->sacks; i++) {
- tp->left_sack[i] = ntohl(((__u32 *)ptr)[2*i]);
- tp->right_sack[i] = ntohl(((__u32 *)ptr)[2*i+1]);
- }
- } else
- tp->sacks = 0;
}
ptr+=opsize-2;
length-=opsize;
@@ -374,7 +332,7 @@
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_option().
- * This should probably get extended for timestamps + SACK as well.
+ * This should probably get extended for timestamps as well.
* Assembly code anyone? -- erics
*/
static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *tp)
@@ -384,14 +342,12 @@
return 0;
if (th->doff == sizeof(struct tcphdr)>>2) {
tp->saw_tstamp = 0;
- tp->sacks = 0;
return 0;
- } else if (th->doff == (sizeof(struct tcphdr)>>2)+3) {
+ } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
__u32 *ptr = (__u32 *)(th + 1);
- if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
- | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+ if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->saw_tstamp = 1;
- tp->sacks = 0;
tp->rcv_tsval = ntohl(*++ptr);
tp->rcv_tsecr = ntohl(*++ptr);
return 1;
@@ -401,89 +357,6 @@
return 1;
}
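The single-word compare works because the canonical aligned timestamp option is exactly NOP, NOP, TIMESTAMP, length. With the standard values (TCPOPT_NOP = 1, TCPOPT_TIMESTAMP = 8, TCPOLEN_TIMESTAMP = 10) the predicted word is 0x0101080a; byte-order handling is elided in this illustration:

#include <stdio.h>

#define TCPOPT_NOP		1
#define TCPOPT_TIMESTAMP	8
#define TCPOLEN_TIMESTAMP	10

int main(void)
{
	unsigned int w = (TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP;

	/* One compare against this word accepts exactly the aligned
	 * NOP,NOP,TIMESTAMP,10 layout and nothing else.
	 */
	printf("predicted option word: 0x%08x\n", w);	/* 0x0101080a */
	return 0;
}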
-#if 0
-
-/*
- * This is the old fast retransmit code. It will go away eventually. -- erics
- */
-
-/*
- * See draft-stevens-tcpca-spec-01 for documentation.
- */
-
-static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
-{
- struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
-
- /* FIXME: if we are already retransmitting should this code
- * be skipped? [Floyd high_seq check sort of does this]
- * The case I'm worried about is falling into a fast
- * retransmit on a link with a congestion window of 1 or 2.
- * There was some evidence in 2.0.x that this was problem
- * on really slow links (1200 or 2400 baud). I need to
- * try this situation again and see what happens.
- */
-
- /*
- * An ACK is a duplicate if:
- * (1) it has the same sequence number as the largest number we've
- * seen,
- * (2) it has the same window as the last ACK,
- * (3) we have outstanding data that has not been ACKed
- * (4) The packet was not carrying any data.
- * (5) [From Floyds paper on fast retransmit wars]
- * The packet acked data after high_seq;
- */
-
- if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
- /* 1. When the third duplicate ack is received, set ssthresh
- * to one half the current congestion window, but no less
- * than two segments. Retransmit the missing segment.
- */
- if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
- tp->dup_acks++;
-
- if (tp->dup_acks == 3) {
- tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
- tp->snd_cwnd = tp->snd_ssthresh + 3;
- tcp_do_retransmit(sk, 0);
-
- /* Careful not to timeout just after fast
- * retransmit!
- */
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
- }
- }
-
- /* 2. Each time another duplicate ACK arrives, increment
- * cwnd by the segment size. [...] Transmit a packet...
- *
- * Packet transmission will be done on normal flow processing
- * since we're not in "retransmit mode".
- */
- if (tp->dup_acks >= 3) {
- tp->dup_acks++;
- tp->snd_cwnd++;
- }
- } else {
- /* 3. When the next ACK arrives that acknowledges new data,
- * set cwnd to ssthresh.
- */
- if (tp->dup_acks >= 3) {
- tp->retrans_head = NULL;
- tp->snd_cwnd = max(tp->snd_ssthresh, 1);
- tp->retransmits = 0;
- }
- tp->dup_acks = 0;
-
- /* FIXME: This is wrong if the new ack that arrives
- * is below the value for high_seq.
- */
- tp->high_seq = 0;
- }
-}
-#endif
-
#define FLAG_DATA 0x01
#define FLAG_WIN_UPDATE 0x02
#define FLAG_DATA_ACKED 0x04
@@ -579,9 +452,8 @@
* not indicate a packet left the system.
* We can test this by just checking
* if ack changed from snd_una, since
- * the only way to get here without changing
- * advancing from snd_una is if this was a
- * window update.
+ * the only way to get here without advancing
+ * from snd_una is if this was a window update.
*/
if (ack != tp->snd_una && before(ack,tp->high_seq)) {
tcp_do_retransmit(sk, 0);
@@ -596,9 +468,6 @@
clear_fast_retransmit(sk);
}
}
- } else {
- /* Clear any aborted fast retransmit starts. */
- tp->dup_acks = 0;
}
}
@@ -649,7 +518,6 @@
expected = (tp->snd_nxt - tp->snd_una) * inv_basertt;
- /* XXX sk->mss should move into tcp_opt as well -DaveM */
inv_basebd = sk->mss * inv_basertt;
/* Slow Start */
@@ -731,13 +599,6 @@
int acked = 0;
while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
-#ifdef TCP_DEBUG
- /* Check for a bug. */
- if (skb->next != (struct sk_buff*) &sk->write_queue &&
- after(skb->end_seq, skb->next->seq))
- printk(KERN_DEBUG "INET: tcp_input.c: *** "
- "bug send_list out of order.\n");
-#endif
/* If our packet is before the ack sequence we can
* discard it as it's confirmed to have arrived the
* other end.
@@ -745,12 +606,15 @@
if (after(skb->end_seq, ack))
break;
-#if 0
- SOCK_DEBUG(sk, "removing seg %x-%x from retransmit queue\n",
- skb->seq, skb->end_seq);
-#endif
-
- acked = FLAG_DATA_ACKED;
+ /* Initial outgoing SYN's get put onto the write_queue
+ * just like anything else we transmit. It is not
+ * true data, and if we misinform our callers that
+ * this ACK acks real data, we will erroneously exit
+ * connection startup slow start one packet too
+ * quickly. This is severely frowned upon behavior.
+ */
+ if(!skb->h.th->syn)
+ acked = FLAG_DATA_ACKED;
/* FIXME: packet counting may break if we have to
* do packet "repackaging" for stacks that don't
@@ -766,11 +630,9 @@
kfree_skb(skb);
}
- if (acked) {
+ if (acked)
tp->retrans_head = NULL;
- if (!sk->dead)
- sk->write_space(sk);
- }
+
return acked;
}
@@ -795,6 +657,66 @@
}
}
+/* Read draft-ietf-tcplw-high-performance before mucking
+ * with this code. (Supersedes RFC1323)
+ */
+static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
+ u32 seq, u32 ack, int flag)
+{
+ __u32 seq_rtt = (jiffies-tp->rcv_tsecr);
+ tcp_rtt_estimator(tp, seq_rtt);
+ if (tp->retransmits) {
+ if (tp->packets_out == 0) {
+ tp->retransmits = 0;
+ tp->backoff = 0;
+ tcp_set_rto(tp);
+ } else {
+ /* Still retransmitting, use backoff */
+ tcp_set_rto(tp);
+ tp->rto = tp->rto << tp->backoff;
+ }
+ } else {
+ tcp_set_rto(tp);
+ if (flag & FLAG_DATA_ACKED)
+ (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
+ }
+ /* NOTE: safe here so long as cong_ctl doesn't use rto */
+ tcp_bound_rto(tp);
+}
+
+static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
+{
+ struct sk_buff *skb;
+ long when;
+
+ skb = skb_peek(&sk->write_queue);
+ when = tp->rto - (jiffies - skb->when);
+
+ /* FIXME: This assumes that when we are retransmitting
+ * we should only ever respond with one packet.
+ * This means congestion windows should not grow
+ * during recovery. In 2.0.X we allow the congestion
+ * window to grow. It is not clear to me which
+ * decision is correct. The RFCs should be double
+ * checked as should the behavior of other stacks.
+ * Also note that if we do want to allow the
+ * congestion window to grow during retransmits
+ * we have to fix the call to congestion window
+ * updates so that it works during retransmission.
+ */
+ if (tp->retransmits) {
+ tp->retrans_head = NULL;
+
+ /* This is tricky. We are retransmitting a
+ * segment of a window when congestion occurred.
+ */
+ tcp_do_retransmit(sk, 0);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ } else {
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
+ }
+}
+
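The non-retransmit arm re-arms the timer with only the remainder of the RTO, crediting the time the oldest unacked segment has already spent in flight. In plain arithmetic (made-up jiffies values):

#include <stdio.h>

int main(void)
{
	long rto = 300;		/* assumed RTO in jiffies */
	long sent = 10000;	/* skb->when of the oldest unacked segment */
	long now = 10120;	/* current jiffies */
	long when = rto - (now - sent);	/* remainder of the timeout */

	printf("re-arm retransmit timer in %ld jiffies\n", when);	/* 180 */
	return 0;
}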
/*
* This routine deals with incoming acks, but not outgoing ones.
*/
@@ -806,7 +728,6 @@
int flag = 0;
u32 seq = 0;
u32 seq_rtt = 0;
- struct sk_buff *skb;
if(sk->zapped)
return(1); /* Dead, can't ack any more so why bother */
@@ -838,7 +759,7 @@
*/
if (before(tp->snd_wl1, ack_seq) ||
(tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
- unsigned long nwin = ntohs(th->window) << tp->snd_wscale;
+ u32 nwin = ntohs(th->window) << tp->snd_wscale;
if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
flag |= FLAG_WIN_UPDATE;
@@ -869,28 +790,7 @@
/* If we have a timestamp, we always do rtt estimates. */
if (tp->saw_tstamp) {
- /* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Superceeds RFC1323)
- */
- seq_rtt = (jiffies-tp->rcv_tsecr);
- tcp_rtt_estimator(tp, seq_rtt);
- if (tp->retransmits) {
- if (tp->packets_out == 0) {
- tp->retransmits = 0;
- tp->backoff = 0;
- tcp_set_rto(tp);
- } else {
- /* Still retransmitting, use backoff */
- tcp_set_rto(tp);
- tp->rto = tp->rto << tp->backoff;
- }
- } else {
- tcp_set_rto(tp);
- if (flag & FLAG_DATA_ACKED)
- (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
- }
- /* NOTE: safe here so long as cong_ctl doesn't use rto */
- tcp_bound_rto(tp);
+ tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
} else {
/* If we were retransmitting don't count rtt estimate. */
if (tp->retransmits) {
@@ -916,51 +816,217 @@
}
if (tp->packets_out) {
- if (flag & FLAG_DATA_ACKED) {
- long when;
-
- skb = skb_peek(&sk->write_queue);
- when = tp->rto - (jiffies - skb->when);
-
- /* FIXME: This assumes that when we are retransmitting
- * we should only ever respond with one packet.
- * This means congestion windows should not grow
- * during recovery. In 2.0.X we allow the congestion
- * window to grow. It is not clear to me which
- * decision is correct. The RFCs should be double
- * checked as should the behavior of other stacks.
- * Also note that if we do want to allow the
- * congestion window to grow during retransmits
- * we have to fix the call to congestion window
- * updates so that it works during retransmission.
- */
- if (tp->retransmits) {
- tp->retrans_head = NULL;
-
- /* This is tricky. We are retransmiting a
- * segment of a window when congestion occured.
- */
- tcp_do_retransmit(sk, 0);
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
- } else
- tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
- }
- } else
+ if (flag & FLAG_DATA_ACKED)
+ tcp_ack_packets_out(sk, tp);
+ } else {
tcp_clear_xmit_timer(sk, TIME_RETRANS);
+ }
- tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE)));
-
+ flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
+ if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
+ (tp->high_seq != 0)) {
+ tcp_fast_retrans(sk, ack, flag);
+ } else {
+ /* Clear any aborted fast retransmit starts. */
+ tp->dup_acks = 0;
+ }
/* Remember the highest ack received. */
tp->snd_una = ack;
-
return 1;
uninteresting_ack:
-
SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
return 0;
}
+/* New-style handling of TIME_WAIT sockets. */
+static void tcp_timewait_kill(unsigned long __arg)
+{
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)__arg;
+
+ /* Zap the timer. */
+ del_timer(&tw->timer);
+
+ /* Unlink from various places. */
+ if(tw->bind_next)
+ tw->bind_next->bind_pprev = tw->bind_pprev;
+ *(tw->bind_pprev) = tw->bind_next;
+ if(tw->tb->owners == NULL)
+ tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
+
+ if(tw->next)
+ tw->next->pprev = tw->pprev;
+ *tw->pprev = tw->next;
+
+ /* We decremented the prot->inuse count when we entered TIME_WAIT
+ * and the sock from which this came was destroyed.
+ */
+ tw->sklist_next->sklist_prev = tw->sklist_prev;
+ tw->sklist_prev->sklist_next = tw->sklist_next;
+
+ /* Ok, now free it up. */
+ kmem_cache_free(tcp_timewait_cachep, tw);
+}
+
+/* We come here as a special case from the AF specific TCP input processing,
+ * and the SKB has no owner. Essentially handling this is very simple,
+ * we just keep silently eating rx'd packets until none show up for the
+ * entire timeout period. The only special cases are for BSD TIME_WAIT
+ * reconnects and SYN/RST bits being set in the TCP header.
+ */
+int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
+ struct tcphdr *th, void *opt, __u16 len)
+{
+ /* RFC 1122:
+ * "When a connection is [...] on TIME-WAIT state [...]
+ * [a TCP] MAY accept a new SYN from the remote TCP to
+ * reopen the connection directly, if it:
+ *
+ * (1) assigns its initial sequence number for the new
+ * connection to be larger than the largest sequence
+ * number it used on the previous connection incarnation,
+ * and
+ *
+ * (2) returns to TIME-WAIT state if the SYN turns out
+ * to be an old duplicate".
+ */
+ if(th->syn && !th->rst && after(skb->seq, tw->rcv_nxt)) {
+ struct sock *sk;
+ struct tcp_func *af_specific = tw->af_specific;
+ __u32 isn;
+
+ isn = tw->rcv_nxt + 128000;
+ if(isn == 0)
+ isn++;
+ tcp_timewait_kill((unsigned long)tw);
+ sk = af_specific->get_sock(skb, th);
+ if(sk == NULL || !ipsec_sk_policy(sk,skb))
+ return 0;
+ skb_set_owner_r(skb, sk);
+ af_specific = sk->tp_pinfo.af_tcp.af_specific;
+ if(af_specific->conn_request(sk, skb, opt, isn) < 0)
+ return 1; /* Toss a reset back. */
+ return 0; /* Discard the frame. */
+ }
+
+ /* Check RST or SYN */
+ if(th->rst || th->syn) {
+ /* This is TIME_WAIT assassination, in two flavors.
+ * Oh well... nobody has a sufficient solution to this
+ * protocol bug yet.
+ */
+ if(sysctl_tcp_rfc1337 == 0)
+ tcp_timewait_kill((unsigned long)tw);
+
+ if(!th->rst)
+ return 1; /* toss a reset back */
+ } else {
+ if(th->ack) {
+ /* In this case we must reset the TIMEWAIT timer. */
+ del_timer(&tw->timer);
+ tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN;
+ add_timer(&tw->timer);
+ }
+ }
+ return 0; /* Discard the frame. */
+}
+
+/* Enter the time wait state. This is always called from BH
+ * context. Essentially we whip up a timewait bucket, copy the
+ * relevant info into it from the SK, and mess with hash chains
+ * and list linkage.
+ */
+static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
+{
+ struct sock **head, *sktw;
+
+ /* Step 1: Remove SK from established hash. */
+ if(sk->next)
+ sk->next->pprev = sk->pprev;
+ *sk->pprev = sk->next;
+ sk->pprev = NULL;
+ tcp_reg_zap(sk);
+
+ /* Step 2: Put TW into bind hash where SK was. */
+ tw->tb = (struct tcp_bind_bucket *)sk->prev;
+ if((tw->bind_next = sk->bind_next) != NULL)
+ sk->bind_next->bind_pprev = &tw->bind_next;
+ tw->bind_pprev = sk->bind_pprev;
+ *sk->bind_pprev = (struct sock *)tw;
+
+ /* Step 3: Same for the protocol sklist. */
+ (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
+ (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
+ sk->sklist_next = NULL;
+ sk->prot->inuse--;
+
+ /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
+ head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)];
+ sktw = (struct sock *)tw;
+ if((sktw->next = *head) != NULL)
+ (*head)->pprev = &sktw->next;
+ *head = sktw;
+ sktw->pprev = head;
+}
+
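The hashdance is built entirely on the pprev idiom: every node stores the address of whatever pointer points at it, so unlinking needs neither the list head nor the previous node. Reduced to a toy type (illustrative, not the kernel structures):

#include <stdio.h>

struct node {
	struct node *next;
	struct node **pprev;	/* address of the pointer that points at us */
};

static void node_insert(struct node **head, struct node *n)
{
	if ((n->next = *head) != NULL)
		(*head)->pprev = &n->next;
	*head = n;
	n->pprev = head;
}

static void node_unlink(struct node *n)
{
	if (n->next)
		n->next->pprev = n->pprev;
	*n->pprev = n->next;	/* O(1), no list head needed */
	n->pprev = NULL;
}

int main(void)
{
	struct node a = { 0 }, b = { 0 };
	struct node *head = NULL;

	node_insert(&head, &a);
	node_insert(&head, &b);	/* list: b -> a */
	node_unlink(&b);
	printf("head is a: %d\n", head == &a);	/* 1 */
	return 0;
}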
+void tcp_time_wait(struct sock *sk)
+{
+ struct tcp_tw_bucket *tw;
+
+ tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
+ if(tw != NULL) {
+ /* Give us an identity. */
+ tw->daddr = sk->daddr;
+ tw->rcv_saddr = sk->rcv_saddr;
+ tw->bound_dev_if= sk->bound_dev_if;
+ tw->num = sk->num;
+ tw->state = TCP_TIME_WAIT;
+ tw->family = sk->family;
+ tw->source = sk->dummy_th.source;
+ tw->dest = sk->dummy_th.dest;
+ tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
+ tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if(tw->family == AF_INET6) {
+ memcpy(&tw->v6_daddr,
+ &sk->net_pinfo.af_inet6.daddr,
+ sizeof(struct in6_addr));
+ memcpy(&tw->v6_rcv_saddr,
+ &sk->net_pinfo.af_inet6.rcv_saddr,
+ sizeof(struct in6_addr));
+ }
+#endif
+ /* Linkage updates. */
+ tcp_tw_hashdance(sk, tw);
+
+ /* Get the TIME_WAIT timeout firing. */
+ init_timer(&tw->timer);
+ tw->timer.function = tcp_timewait_kill;
+ tw->timer.data = (unsigned long) tw;
+ tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN;
+ add_timer(&tw->timer);
+
+ /* CLOSE the SK. */
+ if(sk->state == TCP_ESTABLISHED)
+ tcp_statistics.TcpCurrEstab--;
+ sk->state = TCP_CLOSE;
+ net_reset_timer(sk, TIME_DONE,
+ min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
+ } else {
+ /* Sorry, we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than
+ * non-graceful socket closings.
+ */
+ tcp_set_state(sk, TCP_CLOSE);
+ }
+
+ /* Prevent rcvmsg/sndmsg calls, and wake people up. */
+ sk->shutdown = SHUTDOWN_MASK;
+ if(!sk->dead)
+ sk->state_change(sk);
+}
+
/*
* Process the FIN bit. This now behaves as it is supposed to work
* and the FIN takes effect when it is validly part of sequence
@@ -976,17 +1042,9 @@
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
-static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
- if(sk->state == TCP_SYN_SENT) {
- /* RFC793 says to drop the segment and return. */
- return 1;
- }
-
- /* XXX This fin_seq thing should disappear... -DaveM */
- tp->fin_seq = skb->end_seq;
+ sk->tp_pinfo.af_tcp.fin_seq = skb->end_seq;
tcp_send_ack(sk);
@@ -1013,12 +1071,6 @@
case TCP_LAST_ACK:
/* RFC793: Remain in the LAST-ACK state. */
break;
- case TCP_TIME_WAIT:
- /* Received a retransmission of the FIN,
- * restart the TIME_WAIT timer.
- */
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- break;
case TCP_FIN_WAIT1:
/* This case occurs when a simultaneous close
@@ -1035,21 +1087,15 @@
break;
case TCP_FIN_WAIT2:
/* Received a FIN -- send ACK and enter TIME_WAIT. */
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- sk->shutdown |= SHUTDOWN_MASK;
- tcp_set_state(sk,TCP_TIME_WAIT);
- break;
- case TCP_CLOSE:
- /* Already in CLOSE. */
+ tcp_time_wait(sk);
break;
default:
- /* Only TCP_LISTEN is left, in that case we should never
- * reach this piece of code.
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+ * cases we should never reach this piece of code.
*/
printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
break;
};
- return 0;
}
/* This one checks to see if we can put data from the
@@ -1060,7 +1106,7 @@
struct sk_buff *skb;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- while ((skb = skb_peek(&sk->out_of_order_queue))) {
+ while ((skb = skb_peek(&tp->out_of_order_queue))) {
if (after(skb->seq, tp->rcv_nxt))
break;
@@ -1076,6 +1122,8 @@
skb_unlink(skb);
skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = skb->end_seq;
+ if(skb->h.th->fin)
+ tcp_fin(skb, sk, skb->h.th);
}
}
@@ -1094,8 +1142,12 @@
dst_confirm(sk->dst_cache);
skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = skb->end_seq;
+ if(skb->h.th->fin)
+ tcp_fin(skb, sk, skb->h.th);
+ else
+ tp->delayed_acks++;
tcp_ofo_queue(sk);
- if (skb_queue_len(&sk->out_of_order_queue) == 0)
+ if (skb_queue_len(&tp->out_of_order_queue) == 0)
tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd);
return;
}
@@ -1104,8 +1156,7 @@
if (!after(skb->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an immediate ack. */
SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq);
-
- tp->delayed_acks = MAX_DELAY_ACK;
+ tcp_enter_quickack_mode(tp);
kfree_skb(skb);
return;
}
@@ -1119,7 +1170,8 @@
}
/* Ok. This is an out_of_order segment, force an ack. */
- tp->delayed_acks = MAX_DELAY_ACK;
+ tp->delayed_acks++;
+ tcp_enter_quickack_mode(tp);
/* Disable header prediction. */
tp->pred_flags = 0;
@@ -1127,10 +1179,10 @@
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, skb->seq, skb->end_seq);
- if (skb_peek(&sk->out_of_order_queue) == NULL) {
- skb_queue_head(&sk->out_of_order_queue,skb);
+ if (skb_peek(&tp->out_of_order_queue) == NULL) {
+ skb_queue_head(&tp->out_of_order_queue,skb);
} else {
- for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) {
+ for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
/* Already there. */
if (skb->seq == skb1->seq && skb->len >= skb1->len) {
skb_append(skb1, skb);
@@ -1145,8 +1197,8 @@
}
/* See if we've hit the start. If so insert. */
- if (skb1 == skb_peek(&sk->out_of_order_queue)) {
- skb_queue_head(&sk->out_of_order_queue,skb);
+ if (skb1 == skb_peek(&tp->out_of_order_queue)) {
+ skb_queue_head(&tp->out_of_order_queue,skb);
break;
}
}
@@ -1172,23 +1224,17 @@
if (skb->len == 0 && !th->fin)
return(0);
- /* FIXME: don't accept data after the received fin.
- *
- * Would checking snd_seq against fin_seq be enough?
- * If so, how do we handle that case exactly? -DaveM
- */
-
/* We no longer have anyone receiving data on this connection. */
tcp_data_queue(sk, skb);
- if (before(tp->rcv_nxt, sk->copied_seq)) {
+ if (before(tp->rcv_nxt, tp->copied_seq)) {
printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
- tp->rcv_nxt = sk->copied_seq;
+ tp->rcv_nxt = tp->copied_seq;
}
- tp->delayed_acks++;
-
- /* Now tell the user we may have some data. */
+ /* Above, tcp_data_queue() increments delayed_acks appropriately.
+ * Now tell the user we may have some data.
+ */
if (!sk->dead) {
SOCK_DEBUG(sk, "Data wakeup.\n");
sk->data_ready(sk,0);
@@ -1204,23 +1250,10 @@
if ((skb = tp->send_head)) {
if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
tp->packets_out < tp->snd_cwnd ) {
- /* Add more data to the send queue. */
-
- /* FIXME: the congestion window is checked
- * again in tcp_write_xmit anyway?! -- erics
- *
- * I think it must, it bumps tp->packets_out for
- * each packet it fires onto the wire. -DaveM
- */
+ /* Put more data onto the wire. */
tcp_write_xmit(sk);
- if(!sk->dead)
- sk->write_space(sk);
} else if (tp->packets_out == 0 && !tp->pending) {
- /* Data to queue but no room. */
-
- /* FIXME: Is it right to do a zero window probe into
- * a congestion window limited window??? -- erics
- */
+ /* Start probing the receivers window. */
tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
}
}
@@ -1240,12 +1273,24 @@
* - delay time <= 0.5 HZ
* - we don't have a window update to send
* - must send at least every 2 full sized packets
+ *
+ * With an extra heuristic to handle packet loss
+ * situations and to help the sender leave slow
+ * start in an expedient manner.
*/
- if (tp->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk))
+ /* Two full frames received or... */
+ if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) ||
+ /* We will update the window "significantly" or... */
+ tcp_raise_window(sk) ||
+ /* We entered "quick ACK" mode */
+ tcp_in_quickack_mode(tp)) {
+ /* Then ack it now */
tcp_send_ack(sk);
- else
- tcp_send_delayed_ack(sk, HZ/2);
+ } else {
+ /* Else, send delayed ack. */
+ tcp_send_delayed_ack(tp, HZ/2);
+ }
}
static __inline__ void tcp_ack_snd_check(struct sock *sk)
@@ -1279,11 +1324,11 @@
ptr += ntohl(th->seq);
/* Ignore urgent data that we've already seen and read. */
- if (after(sk->copied_seq, ptr))
+ if (after(tp->copied_seq, ptr))
return;
/* Do we already have a newer (or duplicate) urgent pointer? */
- if (sk->urg_data && !after(ptr, sk->urg_seq))
+ if (tp->urg_data && !after(ptr, tp->urg_seq))
return;
/* Tell the world about our new urgent pointer. */
@@ -1296,14 +1341,14 @@
/* We may be adding urgent data when the last byte read was
* urgent. To do this requires some care. We cannot just ignore
- * sk->copied_seq since we would read the last urgent byte again
+ * tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
* or we break the semantics of SIOCATMARK (and thus sockatmark())
*/
- if (sk->urg_seq == sk->copied_seq)
- sk->copied_seq++; /* Move the copied sequence on correctly */
- sk->urg_data = URG_NOTYET;
- sk->urg_seq = ptr;
+ if (tp->urg_seq == tp->copied_seq)
+ tp->copied_seq++; /* Move the copied sequence on correctly */
+ tp->urg_data = URG_NOTYET;
+ tp->urg_seq = ptr;
/* Disable header prediction. */
tp->pred_flags = 0;
@@ -1312,17 +1357,19 @@
/* This is the 'fast' part of urgent handling. */
static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
/* Check if we get a new urgent pointer - normally not. */
if (th->urg)
tcp_check_urg(sk,th);
/* Do we wait for any urgent data? - normally not... */
- if (sk->urg_data == URG_NOTYET) {
- u32 ptr = sk->urg_seq - ntohl(th->seq) + (th->doff*4);
+ if (tp->urg_data == URG_NOTYET) {
+ u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
/* Is the urgent pointer pointing into this packet? */
if (ptr < len) {
- sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
+ tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
if (!sk->dead)
sk->data_ready(sk,0);
}
@@ -1335,33 +1382,39 @@
*/
static void prune_queue(struct sock *sk)
{
- struct tcp_opt *tp;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
struct sk_buff * skb;
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", sk->copied_seq);
+ SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
/* First Clean the out_of_order queue. */
/* Start with the end because there are probably the least
* useful packets (crossing fingers).
*/
- while ((skb = skb_dequeue_tail(&sk->out_of_order_queue))) {
+ while ((skb = skb_dequeue_tail(&tp->out_of_order_queue))) {
kfree_skb(skb);
if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
return;
}
- tp = &sk->tp_pinfo.af_tcp;
-
/* Now continue with the receive queue if it wasn't enough */
while ((skb = skb_peek_tail(&sk->receive_queue))) {
+ /* Never toss anything when we've seen the FIN.
+ * It's just too complex to recover from it.
+ */
+ if(skb->h.th->fin)
+ break;
+
/* Never remove packets that have been already acked */
if (before(skb->end_seq, tp->last_ack_sent+1)) {
printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n",
- sk->copied_seq, skb->end_seq, tp->last_ack_sent);
+ tp->copied_seq, skb->end_seq, tp->last_ack_sent);
break;
}
skb_unlink(skb);
tp->rcv_nxt = skb->seq;
+ SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n",
+ skb->seq, skb->end_seq, tp->copied_seq);
kfree_skb(skb);
if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
break;
@@ -1429,7 +1482,6 @@
}
} else if (skb->ack_seq == tp->snd_una) {
/* Bulk data transfer: receiver */
-
if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
goto discard;
@@ -1441,18 +1493,13 @@
skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = skb->end_seq;
+ /* FIN bit check is not done since if FIN is set in
+ * this frame, the pred_flags won't match up. -DaveM
+ */
sk->data_ready(sk, 0);
tcp_delack_estimator(tp);
-
-#if 1 /* This checks for required window updates too. */
tp->delayed_acks++;
__tcp_ack_snd_check(sk);
-#else
- if (tp->delayed_acks++ == 0)
- tcp_send_delayed_ack(sk, HZ/2);
- else
- tcp_send_ack(sk);
-#endif
return 0;
}
}
@@ -1469,7 +1516,7 @@
}
}
- if(th->syn && skb->seq != sk->syn_seq) {
+ if(th->syn && skb->seq != tp->syn_seq) {
SOCK_DEBUG(sk, "syn in established state\n");
tcp_statistics.TcpInErrs++;
tcp_reset(sk, skb);
@@ -1490,10 +1537,6 @@
/* step 7: process the segment text */
queued = tcp_data(skb, sk, len);
- /* step 8: check the FIN bit */
- if (th->fin)
- (void) tcp_fin(skb, sk, th);
-
tcp_data_snd_check(sk);
/* If our receive queue has grown past its limits shrink it */
@@ -1657,19 +1700,19 @@
tp->snd_wnd = htons(th->window) << tp->snd_wscale;
tp->snd_wl1 = skb->seq;
tp->snd_wl2 = skb->ack_seq;
-
tp->fin_seq = skb->seq;
tcp_set_state(sk, TCP_ESTABLISHED);
tcp_parse_options(th,tp,0);
- /* FIXME: need to make room for SACK still */
+
if (tp->wscale_ok == 0) {
tp->snd_wscale = tp->rcv_wscale = 0;
tp->window_clamp = min(tp->window_clamp,65535);
}
if (tp->tstamp_ok) {
- tp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: Define constant! */
- sk->dummy_th.doff += 3; /* reserve space of options */
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ sk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2);
} else
tp->tcp_header_len = sizeof(struct tcphdr);
if (tp->saw_tstamp) {
@@ -1680,14 +1723,30 @@
/* Can't be earlier, doff would be wrong. */
tcp_send_ack(sk);
- if (tp->in_mss)
- sk->mss = min(sk->mss, tp->in_mss);
-
- /* Take out space for tcp options. */
- sk->mss -= tp->tcp_header_len - sizeof(struct tcphdr);
+ /* Check for the case where we tried to advertise
+ * a window including timestamp options, but did not
+ * end up using them for this connection.
+ */
+ if((tp->tstamp_ok == 0) && sysctl_tcp_timestamps)
+ sk->mss += TCPOLEN_TSTAMP_ALIGNED;
+ /* Now limit it if the other end negotiated a smaller
+ * value.
+ */
+ if (tp->in_mss) {
+ int real_mss = tp->in_mss;
+
+ /* We store MSS locally with the timestamp bytes
+ * subtracted, TCPs advertise it with them
+ * included. Account for this fact.
+ */
+ if(tp->tstamp_ok)
+ real_mss -= TCPOLEN_TSTAMP_ALIGNED;
+ sk->mss = min(sk->mss, real_mss);
+ }
+
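Note the bookkeeping convention: the local sk->mss is stored net of the 12 bytes the aligned timestamp option consumes, while a peer's advertised MSS is gross. A quick numeric check for the common Ethernet case (values illustrative):

#include <stdio.h>

#define TCPOLEN_TSTAMP_ALIGNED	12

int main(void)
{
	int local_mss = 1460 - TCPOLEN_TSTAMP_ALIGNED;	/* stored: 1448 */
	int peer_adv = 1460;				/* gross, as advertised */
	int real_mss = peer_adv - TCPOLEN_TSTAMP_ALIGNED;

	if (local_mss > real_mss)
		local_mss = real_mss;	/* clamp to the peer's real limit */
	printf("effective mss: %d\n", local_mss);	/* 1448 */
	return 0;
}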
sk->dummy_th.dest = th->source;
- sk->copied_seq = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
if(!sk->dead) {
sk->state_change(sk);
@@ -1722,52 +1781,10 @@
}
break;
-
- case TCP_TIME_WAIT:
- /* RFC 1122:
- * "When a connection is [...] on TIME-WAIT state [...]
- * [a TCP] MAY accept a new SYN from the remote TCP to
- * reopen the connection directly, if it:
- *
- * (1) assigns its initial sequence number for the new
- * connection to be larger than the largest sequence
- * number it used on the previous connection incarnation,
- * and
- *
- * (2) returns to TIME-WAIT state if the SYN turns out
- * to be an old duplicate".
- */
- if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) {
- __u32 isn;
-
- skb_orphan(skb);
- sk->err = ECONNRESET;
- tcp_set_state(sk, TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
-
- isn = tp->rcv_nxt + 128000;
- if (isn == 0)
- isn++;
-
- sk = tp->af_specific->get_sock(skb, th);
-
- if (sk == NULL || !ipsec_sk_policy(sk,skb))
- goto discard;
-
- skb_set_owner_r(skb, sk);
- tp = &sk->tp_pinfo.af_tcp;
-
- if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0)
- return 1;
-
- goto discard;
- }
-
- break;
}
/* Parse the tcp_options present on this header.
- * By this point we really only expect timestamps and SACKs.
+ * By this point we really only expect timestamps.
* Note that this really has to be here and not later for PAWS
* (RFC1323) to work.
*/
@@ -1819,7 +1836,7 @@
* original syn.
*/
- if (th->syn && skb->seq!=sk->syn_seq) {
+ if (th->syn && skb->seq!=tp->syn_seq) {
tcp_reset(sk, skb);
return 1;
}
@@ -1833,7 +1850,7 @@
if (acceptable) {
tcp_set_state(sk, TCP_ESTABLISHED);
sk->dummy_th.dest=th->source;
- sk->copied_seq = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
if(!sk->dead)
sk->state_change(sk);
@@ -1850,7 +1867,7 @@
break;
case TCP_FIN_WAIT1:
- if (tp->snd_una == sk->write_seq) {
+ if (tp->snd_una == tp->write_seq) {
sk->shutdown |= SEND_SHUTDOWN;
tcp_set_state(sk, TCP_FIN_WAIT2);
if (!sk->dead)
@@ -1861,12 +1878,12 @@
break;
case TCP_CLOSING:
- if (tp->snd_una == sk->write_seq)
+ if (tp->snd_una == tp->write_seq)
tcp_time_wait(sk);
break;
case TCP_LAST_ACK:
- if (tp->snd_una == sk->write_seq) {
+ if (tp->snd_una == tp->write_seq) {
sk->shutdown = SHUTDOWN_MASK;
tcp_set_state(sk,TCP_CLOSE);
if (!sk->dead)
@@ -1874,13 +1891,6 @@
goto discard;
}
break;
-
- case TCP_TIME_WAIT:
- /* Keep us in TIME_WAIT until we stop getting
- * packets, reset the timeout.
- */
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- break;
}
} else
goto discard;
@@ -1916,12 +1926,6 @@
if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
prune_queue(sk);
break;
- }
-
- /* step 8: check the FIN bit */
- if (th->fin) {
- if(tcp_fin(skb, sk, th) != 0)
- goto discard;
}
tcp_data_snd_check(sk);