patch-2.1.8 linux/net/ipv4/tcp_input.c
- Lines: 3161
- Date: Sun Nov 3 11:04:45 1996
- Orig file: v2.1.7/linux/net/ipv4/tcp_input.c
- Orig date: Thu Oct 10 19:10:58 1996
diff -u --recursive --new-file v2.1.7/linux/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c
@@ -18,81 +18,85 @@
* Matthew Dillon, <dillon@apollo.west.oic.com>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * TODO
+ * - A better sock cache
+ *
+ */
+
+/*
+ * Changes:
+ * Pedro Roque : Fast Retransmit/Recovery.
+ * Two receive queues.
+ * Retransmit queue handled by TCP.
+ * Better retransmit timer handling.
+ * New congestion avoidance.
+ * Header prediction.
+ * Variable renaming.
*
- * FIXES
- * Pedro Roque : Double ACK bug
+ * Eric : Fast Retransmit.
+ * Randy Scott : MSS option defines.
* Eric Schenk : Fixes to slow start algorithm.
* Eric Schenk : Yet another double ACK bug.
* Eric Schenk : Delayed ACK bug fixes.
* Eric Schenk : Floyd style fast retrans war avoidance.
- * Eric Schenk : Skip fast retransmit on small windows.
- * Eric schenk : Fixes to retransmission code to
- * : avoid extra retransmission.
- * Theodore Ts'o : Do secure TCP sequence numbers.
*/
#include <linux/config.h>
-#include <linux/types.h>
-#include <linux/random.h>
#include <net/tcp.h>
+
/*
- * Policy code extracted so it's now separate
+ * Policy code extracted so it's now separate
*/
/*
* Called each time to estimate the delayed ack timeout. This is
- * how it should be done so a fast link isn't impacted by ack delay.
+ * how it should be done so a fast link isn't impacted by ack delay.
+ *
+ * I think we need a mean deviation here also...
+ * The estimated value is changing too fast
*/
-extern __inline__ void tcp_delack_estimator(struct sock *sk)
+extern __inline__ void tcp_delack_estimator(struct tcp_opt *tp)
{
+ int m;
+
/*
* Delayed ACK time estimator.
*/
- if (sk->lrcvtime == 0)
- {
- sk->lrcvtime = jiffies;
- sk->ato = HZ/3;
- }
- else
+ m = jiffies - tp->lrcvtime;
+
+ tp->lrcvtime = jiffies;
+
+ if (m < 0)
+ return;
+
+ /*
+ * if the measured value is bigger than
+ * twice the round trip time, ignore it.
+ */
+ if ((m << 2) <= tp->srtt)
{
- int m;
-
- m = jiffies - sk->lrcvtime;
+ m -= (tp->iat >> 3);
+ tp->iat += m;
- sk->lrcvtime = jiffies;
+ if (m < 0)
+ m = -m;
- if (m <= 0)
- m = 1;
+ m -= (tp->iat_mdev >> 2);
+ tp->iat_mdev += m;
- /* This used to test against sk->rtt.
- * On a purely receiving link, there is no rtt measure.
- * The result is that we lose delayed ACKs on one-way links.
- * Therefore we test against sk->rto, which will always
- * at least have a default value.
- */
- if (m > sk->rto)
- {
- sk->ato = sk->rto;
- /*
- * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
- */
- }
- else
- {
- /*
- * Very fast acting estimator.
- * May fluctuate too much. Probably we should be
- * doing something like the rtt estimator here.
- */
- sk->ato = (sk->ato >> 1) + m;
- /*
- * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
- */
- }
+ tp->ato = (tp->iat >> 3) + (tp->iat_mdev >> 2);
+
+ if (tp->ato < HZ/50)
+ tp->ato = HZ/50;
}
+ else
+ tp->ato = 0;
}
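
In effect the new estimator tracks the packet inter-arrival time the same way the RTT code tracks round-trip samples: a scaled running mean (iat, scaled by 8) plus a scaled mean deviation (iat_mdev, scaled by 4). A minimal standalone sketch of the update, with HZ=100 and a local struct standing in for the kernel's jiffies/struct tcp_opt machinery (both assumptions):

#define HZ 100				/* assumption: 10 ms ticks */

struct ato_est {
	unsigned long iat;		/* inter-arrival time, scaled << 3 */
	unsigned long iat_mdev;		/* its mean deviation, scaled << 2 */
	unsigned long ato;		/* resulting delayed-ACK timeout */
};

static void ato_sample(struct ato_est *e, long m, unsigned long srtt)
{
	if (m < 0)			/* clock stepped backwards */
		return;
	if ((m << 2) > srtt) {		/* gap > 2 * rtt (srtt is << 3): */
		e->ato = 0;		/* don't delay the ACK at all */
		return;
	}
	m -= e->iat >> 3;		/* error against the running mean */
	e->iat += m;			/* iat = 7/8 iat + 1/8 new */
	if (m < 0)
		m = -m;
	m -= e->iat_mdev >> 2;
	e->iat_mdev += m;		/* mdev = 3/4 mdev + 1/4 |error| */
	e->ato = (e->iat >> 3) + (e->iat_mdev >> 2);
	if (e->ato < HZ / 50)		/* clamp to 20 ms */
		e->ato = HZ / 50;
}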
/*
@@ -100,8 +104,8 @@
* retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
* The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
*/
-
-extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
+
+extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
{
long m;
/*
@@ -111,130 +115,72 @@
* This is designed to be as fast as possible
* m stands for "measurement".
*/
-
- m = jiffies - oskb->when; /* RTT */
+ /*
+ * In a 1990 paper the rto calculation was changed to:
+ * RTO = rtt + 4 * mdev
+ */
- if (sk->rtt != 0) {
+ m = mrtt; /* RTT */
+
+ if (tp->srtt != 0) {
if(m<=0)
m=1; /* IS THIS RIGHT FOR <0 ??? */
- m -= (sk->rtt >> 3); /* m is now error in rtt est */
- sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
+ m -= (tp->srtt >> 3); /* m is now error in rtt est */
+ tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0)
m = -m; /* m is now abs(error) */
- m -= (sk->mdev >> 2); /* similar update on mdev */
- sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
+ m -= (tp->mdev >> 2); /* similar update on mdev */
+ tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
} else {
- /* no previous measure. */
- sk->rtt = m<<3; /* take the measured time to be rtt */
- sk->mdev = m<<1; /* make sure rto = 3*rtt */
+ /* no previous measure. */
+ tp->srtt = m<<3; /* take the measured time to be rtt */
+ tp->mdev = m<<2; /* make sure rto = 5*rtt */
}
+
/*
* Now update timeout. Note that this removes any backoff.
*/
- /* Jacobson's algorithm calls for rto = R + 4V.
- * We diverge from Jacobson's algorithm here. See the commentary
- * in tcp_ack to understand why.
- */
- sk->rto = (sk->rtt >> 3) + sk->mdev;
- sk->rto += (sk->rto>>2) + (sk->rto >> (sk->cong_window-1));
- if (sk->rto > 120*HZ)
- sk->rto = 120*HZ;
- if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
- sk->rto = HZ/5;
- sk->backoff = 0;
-}
-
-/*
- * Cached last hit socket
- */
-
-static volatile unsigned long th_cache_saddr, th_cache_daddr;
-static volatile unsigned short th_cache_dport, th_cache_sport;
-static volatile struct sock *th_cache_sk;
-
-void tcp_cache_zap(void)
-{
- th_cache_sk=NULL;
-}
+ tp->rto = (tp->srtt >> 3) + tp->mdev;
-/*
- * Find the socket, using the last hit cache if applicable. The cache is not quite
- * right...
- */
+ if (tp->rto > 120*HZ)
+ tp->rto = 120*HZ;
-static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport, u32 paddr, u16 pport)
-{
- struct sock * sk;
+ /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
+ if (tp->rto < HZ/5)
+ tp->rto = HZ/5;
- sk = (struct sock *) th_cache_sk;
- if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
- sport != th_cache_sport || dport != th_cache_dport) {
- sk = get_sock(&tcp_prot, dport, saddr, sport, daddr, paddr, pport);
- if (sk) {
- th_cache_saddr=saddr;
- th_cache_daddr=daddr;
- th_cache_dport=dport;
- th_cache_sport=sport;
- th_cache_sk=sk;
- }
- }
- return sk;
+ tp->backoff = 0;
}
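
For reference, the same estimator isolated from the kernel types: srtt is held scaled by 8 and mdev by 4, so the final sum really is RTO = rtt + 4*mdev from the 1990 paper (the struct here is an assumption, not struct tcp_opt):

#define HZ 100				/* assumption */

struct rtt_est {
	long srtt;			/* smoothed RTT, scaled << 3 */
	long mdev;			/* mean deviation, scaled << 2 */
	long rto;
};

static void rtt_sample(struct rtt_est *e, long m)
{
	if (e->srtt != 0) {
		if (m <= 0)
			m = 1;
		m -= e->srtt >> 3;	/* m is now error in rtt est */
		e->srtt += m;		/* srtt = 7/8 srtt + 1/8 new */
		if (m < 0)
			m = -m;		/* m is now abs(error) */
		m -= e->mdev >> 2;
		e->mdev += m;		/* mdev = 3/4 mdev + 1/4 new */
	} else {
		e->srtt = m << 3;	/* first measurement seeds srtt */
		e->mdev = m << 2;	/* ...and makes the first rto 5*rtt */
	}
	/* rto = rtt + 4 * mdev, both held in scaled form */
	e->rto = (e->srtt >> 3) + e->mdev;
	if (e->rto > 120 * HZ)		/* two minute ceiling */
		e->rto = 120 * HZ;
	if (e->rto < HZ / 5)		/* 200 ms floor for delayed ACKs */
		e->rto = HZ / 5;
}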
+
/*
- * React to an out-of-window TCP sequence number in an incoming packet
+ * This function checks to see if the tcp header is actually acceptable.
*/
-static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
- struct device *dev)
+extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 seg_nxt)
{
- if (th->rst)
- return;
+ u32 end_window = tp->rcv_wup + tp->rcv_wnd;
+ u32 end_seq = seg_nxt;
/*
- * Send a reset if we get something not ours and we are
- * unsynchronized. Note: We don't do anything to our end. We
- * are just killing the bogus remote connection then we will
- * connect again and it will work (with luck).
+ * When the window is open (the most common case)
+ * we want to accept a segment if it carries yet unseen data,
+ * or, in the case of a dataless segment, if seg.seq == rcv.nxt.
+ * This means:
+ *
+ * if (seq == end_seq)
+ * end_seq >= rcv.nxt
+ * else
+ * end_seq > rcv.nxt
*/
-
- if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
- {
- tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
- return;
- }
- /*
- * This packet is old news. Usually this is just a resend
- * from the far end, but sometimes it means the far end lost
- * an ACK we sent, so we better send an ACK.
- */
- /*
- * BEWARE! Unconditional answering by ack to out-of-window ack
- * can result in infinite exchange of empty acks.
- * This check cures bug, found by Michiel Boland, but
- * not another possible cases.
- * If we are in TCP_TIME_WAIT, we have already received
- * FIN, so that our peer need not window update. If our
- * ACK were lost, peer would retransmit his FIN anyway. --ANK
- */
- if (sk->state != TCP_TIME_WAIT || ntohl(th->seq) != end_seq)
- tcp_send_ack(sk);
-}
+ if (seq == end_seq)
+ end_seq++;
-/*
- * This functions checks to see if the tcp header is actually acceptable.
- */
-
-extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
-{
- u32 end_window = sk->lastwin_seq + sk->window;
- return /* if start is at end of window, end must be too (zero window) */
- (seq == end_window && seq == end_seq) ||
- /* if start is before end of window, check for interest */
- (before(seq, end_window) && !before(end_seq, sk->acked_seq));
+ return ((before(seq, end_window) && after(end_seq, tp->rcv_nxt)) ||
+ (seq == end_window && seq == end_seq));
}
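
The acceptance test is easier to read outside the diff; before() and after() are the usual modulo-2^32 sequence comparisons from net/tcp.h, re-declared here so the sketch stands alone:

typedef unsigned int u32;

static int before(u32 a, u32 b) { return (int)(a - b) < 0; }
static int after(u32 a, u32 b)  { return (int)(b - a) < 0; }

/* Accept [seq, end_seq) given rcv_nxt and the advertised window. */
static int seq_acceptable(u32 seq, u32 end_seq,
			  u32 rcv_nxt, u32 rcv_wup, u32 rcv_wnd)
{
	u32 end_window = rcv_wup + rcv_wnd;

	if (seq == end_seq)		/* dataless: seq == rcv.nxt is ok */
		end_seq++;
	return (before(seq, end_window) && after(end_seq, rcv_nxt)) ||
	       (seq == end_window && seq == end_seq);
}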
/*
@@ -273,7 +219,7 @@
#endif
if (!sk->dead)
sk->state_change(sk);
- kfree_skb(skb, FREE_READ);
+
return(0);
}
@@ -289,11 +235,11 @@
* as Linux gets deployed on 100Mb/sec networks.
*/
-static void tcp_options(struct sock *sk, struct tcphdr *th)
+int tcp_parse_options(struct tcphdr *th)
{
unsigned char *ptr;
int length=(th->doff*4)-sizeof(struct tcphdr);
- int mss_seen = 0;
+ int mss = 0;
ptr = (unsigned char *)(th + 1);
@@ -304,7 +250,7 @@
switch(opcode)
{
case TCPOPT_EOL:
- return;
+ return 0;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
ptr--; /* the opsize=*ptr++ above was a mistake */
@@ -312,14 +258,13 @@
default:
if(opsize<=2) /* Avoid silly options looping forever */
- return;
+ return 0;
switch(opcode)
{
case TCPOPT_MSS:
- if(opsize==4 && th->syn)
+ if(opsize==TCPOLEN_MSS && th->syn)
{
- sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
- mss_seen = 1;
+ mss = ntohs(*(unsigned short *)ptr);
}
break;
/* Add other options here as people feel the urge to implement stuff like large windows */
@@ -328,612 +273,420 @@
length-=opsize;
}
}
- if (th->syn)
- {
- if (! mss_seen)
- sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
- }
-#ifdef CONFIG_INET_PCTCP
- sk->mss = min(sk->max_window >> 1, sk->mtu);
-#else
- sk->mss = min(sk->max_window, sk->mtu);
- sk->max_unacked = 2 * sk->mss;
-#endif
+
+ return mss;
}
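
A hedged standalone model of the same option walk (simplified relative to the kernel version: it also bounds opsize against the remaining length, and it keeps any MSS already seen when it hits EOL):

#define TCPOPT_EOL	0		/* RFC 793 option kinds */
#define TCPOPT_NOP	1
#define TCPOPT_MSS	2
#define TCPOLEN_MSS	4

static int parse_mss(const unsigned char *ptr, int length, int syn)
{
	int mss = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {	/* one byte of padding */
			length--;
			continue;
		}
		opsize = *ptr++;
		if (opsize <= 2 || opsize > length)
			break;			/* silly or truncated */
		if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS && syn)
			mss = (ptr[0] << 8) | ptr[1];	/* ntohs by hand */
		ptr += opsize - 2;
		length -= opsize;
	}
	return mss;
}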
-/*
- * This routine handles a connection request.
- * It should make sure we haven't already responded.
- * Because of the way BSD works, we have to send a syn/ack now.
- * This also means it will be harder to close a socket which is
- * listening.
+/*
+ * See draft-stevens-tcpca-spec-01 for documentation.
*/
-
-static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
- u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
-{
- struct sock *newsk;
- struct tcphdr *th;
- struct rtable *rt;
-
- th = skb->h.th;
- /* If the socket is dead, don't accept the connection. */
- if (!sk->dead)
- {
- sk->data_ready(sk,0);
- }
- else
- {
- if(sk->debug)
- printk("Reset on %p: Connect on dead socket.\n",sk);
- tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
- tcp_statistics.TcpAttemptFails++;
- kfree_skb(skb, FREE_READ);
- return;
- }
+static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
+{
+ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
/*
- * Make sure we can accept more. This will prevent a
- * flurry of syns from eating up all our memory.
- *
- * BSD does some funnies here and allows 3/2 times the
- * set backlog as a fudge factor. That's just too gross.
+ * An ACK is a duplicate if:
+ * (1) it has the same sequence number as the largest number we've
+ * seen,
+ * (2) it has the same window as the last ACK,
+ * (3) we have outstanding data that has not been ACKed
+ * (4) The packet was not carrying any data.
+ * (5) [From Floyd's paper on fast retransmit wars]
+ * The packet acked data after high_seq;
*/
- if (sk->ack_backlog >= sk->max_ack_backlog)
+ if (ack == tp->snd_una && sk->packets_out && (not_dup == 0) &&
+ after(ack, tp->high_seq))
{
- tcp_statistics.TcpAttemptFails++;
- kfree_skb(skb, FREE_READ);
- return;
- }
-
- /*
- * We need to build a new sock struct.
- * It is sort of bad to have a socket without an inode attached
- * to it, but the wake_up's will just wake up the listening socket,
- * and if the listening socket is destroyed before this is taken
- * off of the queue, this will take care of it.
- */
+
+ sk->dup_acks++;
+
- newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
- if (newsk == NULL)
- {
- /* just ignore the syn. It will get retransmitted. */
- tcp_statistics.TcpAttemptFails++;
- kfree_skb(skb, FREE_READ);
- return;
- }
+ /*
+ * 1. When the third duplicate ack is received, set ssthresh
+ * to one half the current congestion window, but no less
+ * than two segments. Retransmit the missing segment.
+ */
+
+ if (sk->dup_acks == 3)
+ {
+ sk->ssthresh = max(sk->cong_window >> 1, 2);
+ sk->cong_window = sk->ssthresh + 3;
+ tcp_do_retransmit(sk, 0);
+ }
- memcpy(newsk, sk, sizeof(*newsk));
- newsk->opt = NULL;
- newsk->ip_route_cache = NULL;
- if (opt && opt->optlen)
- {
- sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
- if (!sk->opt)
+ /*
+ * 2. Each time another duplicate ACK arrives, increment
+ * cwnd by the segment size. [...] Transmit a packet...
+ *
+ * Packet transmission will be done on normal flow processing
+ * since we're not in "retransmit mode"
+ */
+
+ if (sk->dup_acks > 3)
{
- kfree_s(newsk, sizeof(struct sock));
- tcp_statistics.TcpAttemptFails++;
- kfree_skb(skb, FREE_READ);
- return;
+ sk->cong_window++;
}
- if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
+ }
+ else
+ {
+ /*
+ * 3. When the next ACK arrives that acknowledges new data,
+ * set cwnd to ssthresh
+ */
+
+ if (sk->dup_acks >= 3)
{
- kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
- kfree_s(newsk, sizeof(struct sock));
- tcp_statistics.TcpAttemptFails++;
- kfree_skb(skb, FREE_READ);
- return;
+ sk->tp_pinfo.af_tcp.retrans_head = NULL;
+ sk->cong_window = sk->ssthresh;
+ sk->retransmits = 0;
}
+ sk->dup_acks = 0;
}
- skb_queue_head_init(&newsk->write_queue);
- skb_queue_head_init(&newsk->receive_queue);
- newsk->send_head = NULL;
- newsk->send_tail = NULL;
- newsk->send_next = NULL;
- skb_queue_head_init(&newsk->back_log);
- newsk->rtt = 0;
- newsk->rto = TCP_TIMEOUT_INIT;
- newsk->mdev = TCP_TIMEOUT_INIT;
- newsk->max_window = 0;
- /*
- * See draft-stevens-tcpca-spec-01 for discussion of the
- * initialization of these values.
- */
- newsk->cong_window = 1;
- newsk->cong_count = 0;
- newsk->ssthresh = 0x7fffffff;
-
- newsk->lrcvtime = 0;
- newsk->idletime = 0;
- newsk->high_seq = 0;
- newsk->backoff = 0;
- newsk->blog = 0;
- newsk->intr = 0;
- newsk->proc = 0;
- newsk->done = 0;
- newsk->partial = NULL;
- newsk->pair = NULL;
- newsk->wmem_alloc = 0;
- newsk->rmem_alloc = 0;
- newsk->localroute = sk->localroute;
-
- newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
-
- newsk->err = 0;
- newsk->shutdown = 0;
- newsk->ack_backlog = 0;
- newsk->acked_seq = skb->seq+1;
- newsk->lastwin_seq = skb->seq+1;
- newsk->delay_acks = 1;
- newsk->copied_seq = skb->seq+1;
- newsk->fin_seq = skb->seq;
- newsk->syn_seq = skb->seq;
- newsk->state = TCP_SYN_RECV;
- newsk->timeout = 0;
- newsk->ip_xmit_timeout = 0;
- newsk->write_seq = seq;
- newsk->window_seq = newsk->write_seq;
- newsk->rcv_ack_seq = newsk->write_seq;
- newsk->urg_data = 0;
- newsk->retransmits = 0;
- newsk->linger=0;
- newsk->destroy = 0;
- init_timer(&newsk->timer);
- newsk->timer.data = (unsigned long)newsk;
- newsk->timer.function = &net_timer;
- init_timer(&newsk->delack_timer);
- newsk->delack_timer.data = (unsigned long)newsk;
- newsk->delack_timer.function = tcp_delack_timer;
- init_timer(&newsk->retransmit_timer);
- newsk->retransmit_timer.data = (unsigned long)newsk;
- newsk->retransmit_timer.function = tcp_retransmit_timer;
- newsk->dummy_th.source = skb->h.th->dest;
- newsk->dummy_th.dest = skb->h.th->source;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- /*
- * Deal with possibly redirected traffic by setting num to
- * the intended destination port of the received packet.
- */
- newsk->num = ntohs(skb->h.th->dest);
-
-#endif
- /*
- * Swap these two, they are from our point of view.
- */
-
- newsk->daddr = saddr;
- newsk->saddr = daddr;
- newsk->rcv_saddr = daddr;
+}
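
The three numbered steps map onto a small state machine. A sketch following draft-stevens-tcpca-spec-01 rather than the kernel's exact fields (is_dup stands for rules (1)-(5) above):

struct fr_state {
	unsigned int dup_acks;
	unsigned int cwnd;		/* congestion window, in segments */
	unsigned int ssthresh;
};

/* One ACK; returns 1 when the missing segment should be resent now. */
static int fast_retrans_step(struct fr_state *s, int is_dup)
{
	if (is_dup) {
		if (++s->dup_acks == 3) {
			/* step 1: halve cwnd (floor 2), inflate by the
			 * three segments the dups say have left the net */
			s->ssthresh = s->cwnd / 2 > 2 ? s->cwnd / 2 : 2;
			s->cwnd = s->ssthresh + 3;
			return 1;
		}
		if (s->dup_acks > 3)
			s->cwnd++;	/* step 2: each extra dup frees one */
	} else {
		if (s->dup_acks >= 3)
			s->cwnd = s->ssthresh;	/* step 3: deflate */
		s->dup_acks = 0;
	}
	return 0;
}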
- put_sock(newsk->num,newsk);
- newsk->acked_seq = skb->seq + 1;
- newsk->copied_seq = skb->seq + 1;
- newsk->socket = NULL;
+int sysctl_tcp_vegas_cong_avoidance = 1;
- /*
- * Grab the ttl and tos values and use them
- */
+/*
+ * TCP slow start and congestion avoidance in two flavors:
+ * RFC 1122 and TCP Vegas.
+ *
+ * This is a /proc/sys configurable option.
+ */
- newsk->ip_ttl=sk->ip_ttl;
- newsk->ip_tos=skb->ip_hdr->tos;
+#define SHIFT_FACTOR 12
+static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
+ u32 seq_rtt)
+{
/*
- * Use 512 or whatever user asked for
+ * From:
+ * TCP Vegas: New Techniques for Congestion
+ * Detection and Avoidance.
+ *
+ *
+ * Warning: This code is a scratch implementation taken
+ * from the paper only. The code they distribute seems
+ * to have improved several things over the initial spec.
*/
- /*
- * Note use of sk->user_mss, since user has no direct access to newsk
- */
+ u32 Actual, Expected;
+ u32 snt_bytes;
+ struct tcp_opt * tp;
- rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
- newsk->ip_route_cache = rt;
+ tp = &(sk->tp_pinfo.af_tcp);
+
+ if (!seq_rtt)
+ seq_rtt = 1;
- if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
- newsk->window_clamp = rt->rt_window;
+ if (tp->basertt)
+ tp->basertt = min(seq_rtt, tp->basertt);
else
- newsk->window_clamp = 0;
+ tp->basertt = seq_rtt;
+
- if (sk->user_mss)
- newsk->mtu = sk->user_mss;
- else if (rt)
- newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
- else
- newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
-
- /*
- * But not bigger than device MTU
- */
-
- newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
-
-#ifdef CONFIG_SKIP
-
- /*
- * SKIP devices set their MTU to 65535. This is so they can take packets
- * unfragmented to security process then fragment. They could lie to the
- * TCP layer about a suitable MTU, but it's easier to let skip sort it out
- * simply because the final package we want unfragmented is going to be
- *
- * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
- */
-
- if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
- sk->mtu=skip_pick_mtu(sk->mtu,dev);
-#endif
/*
- * This will min with what arrived in the packet
+ *
+ * Actual = throughput for this segment.
+ * Expected = number_of_bytes in transit / BaseRTT
+ *
*/
- tcp_options(newsk,skb->h.th);
-
- tcp_cache_zap();
- tcp_send_synack(newsk, sk, skb);
-}
-
+ snt_bytes = (ack - seq) << SHIFT_FACTOR;
+
+ Actual = snt_bytes / seq_rtt;
+ Expected = ((tp->snd_nxt - tp->snd_una) << SHIFT_FACTOR) / tp->basertt;
-/*
- * Handle a TCP window that shrunk on us. It shouldn't happen,
- * but..
- *
- * We may need to move packets from the send queue
- * to the write queue, if the window has been shrunk on us.
- * The RFC says you are not allowed to shrink your window
- * like this, but if the other end does, you must be able
- * to deal with it.
- */
-void tcp_window_shrunk(struct sock * sk, u32 window_seq)
-{
- struct sk_buff *skb;
- struct sk_buff *skb2;
- struct sk_buff *wskb = NULL;
-
- skb2 = sk->send_head;
- sk->send_head = NULL;
- sk->send_tail = NULL;
- sk->send_next = NULL;
-
- /*
- * This is an artifact of a flawed concept. We want one
- * queue and a smarter send routine when we send all.
- */
- cli();
- while (skb2 != NULL)
- {
- skb = skb2;
- skb2 = skb->link3;
- skb->link3 = NULL;
- if (after(skb->end_seq, window_seq))
- {
- if (sk->packets_out > 0)
- sk->packets_out--;
- /* We may need to remove this from the dev send list. */
- if (skb->next != NULL)
- {
- skb_unlink(skb);
- }
- /* Now add it to the write_queue. */
- if (wskb == NULL)
- skb_queue_head(&sk->write_queue,skb);
- else
- skb_append(wskb,skb);
- wskb = skb;
- }
- else
- {
- if (sk->send_head == NULL)
- {
- sk->send_head = skb;
- sk->send_tail = skb;
- sk->send_next = skb;
+/*
+ printk(KERN_DEBUG "A:%x E:%x rtt:%x srtt:%x win: %d\n",
+ Actual, Expected, seq_rtt, tp->srtt, sk->cong_window);
+ */
+ /*
+ * Slow Start
+ */
+
+ if (sk->cong_window < sk->ssthresh &&
+ (seq == tp->snd_nxt ||
+ (((Expected - Actual) <=
+ ((TCP_VEGAS_GAMMA << SHIFT_FACTOR) * sk->mss / tp->basertt))
+ )
+ ))
+ {
+
+ /*
+ * "Vegas allows exponential growth only every other
+ * RTT"
+ */
+
+ if (sk->cong_count || sk->cong_window <= 2)
+ {
+ sk->cong_window++;
+ sk->cong_count = 0;
+ }
+ else
+ sk->cong_count++;
+ }
+ else
+ {
+ /*
+ * Congestion Avoidance
+ */
+
+ if (Expected - Actual <=
+ ((TCP_VEGAS_ALPHA << SHIFT_FACTOR) * sk->mss / tp->basertt))
+ {
+ /* Increase Linearly */
+
+ if (sk->cong_count >= sk->cong_window)
+ {
+ sk->cong_window++;
+ sk->cong_count = 0;
}
else
+ sk->cong_count++;
+ }
+
+ if (Expected - Actual >=
+ ((TCP_VEGAS_BETA << SHIFT_FACTOR) * sk->mss / tp->basertt))
+ {
+ /* Decrease Linearly */
+
+ if (sk->cong_count >= sk->cong_window)
{
- sk->send_tail->link3 = skb;
- sk->send_tail = skb;
+ sk->cong_window--;
+ sk->cong_count = 0;
}
- skb->link3 = NULL;
+ else
+ sk->cong_count++;
+
+
+ /* Never less than 2 segments */
+ if (sk->cong_window < 2)
+ sk->cong_window = 2;
}
+
+
+ }
+}
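
The core of the Vegas test is the Expected/Actual throughput difference, kept in fixed point by SHIFT_FACTOR. A back-of-the-envelope model (the struct and parameter names are assumptions mirroring the code above):

#define SHIFT_FACTOR 12			/* fixed-point scale, as above */

struct vegas {
	unsigned long basertt;		/* smallest RTT seen, in ticks */
};

/* Expected - Actual, in (bytes/tick) << SHIFT_FACTOR.  acked is
 * ack - seq; in_flight is snd_nxt - snd_una, both in bytes. */
static long vegas_diff(struct vegas *v, unsigned long acked,
		       unsigned long seq_rtt, unsigned long in_flight)
{
	unsigned long actual, expected;

	if (!seq_rtt)
		seq_rtt = 1;
	if (!v->basertt || seq_rtt < v->basertt)
		v->basertt = seq_rtt;	/* BaseRTT: best RTT so far */

	actual   = (acked << SHIFT_FACTOR) / seq_rtt;
	expected = (in_flight << SHIFT_FACTOR) / v->basertt;
	return (long)(expected - actual);
}

The window then grows while this difference stays below ALPHA * mss / basertt and shrinks once it exceeds BETA * mss / basertt, with GAMMA gating the slow-start branch, exactly as in the code above.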
+
+static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt)
+{
+
+ /*
+ * This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328. Because we keep cong_window in
+ * integral mss's, we can't do cwnd += 1 / cwnd.
+ * Instead, maintain a counter and increment it once every
+ * cwnd times.
+ */
+
+ if (sk->cong_window <= sk->ssthresh)
+ {
+ /*
+ * In "safe" area, increase
+ */
+
+ sk->cong_window++;
}
- sti();
+ else
+ {
+ /*
+ * In dangerous area, increase slowly.
+ * In theory this is
+ * sk->cong_window += 1 / sk->cong_window
+ */
+
+ if (sk->cong_count >= sk->cong_window) {
+
+ sk->cong_window++;
+ sk->cong_count = 0;
+ }
+ else
+ sk->cong_count++;
+ }
}
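
Isolated from the socket structure, the SIGCOMM '88 rule fits in a few lines (a sketch; cwnd is kept in whole segments, as the comment explains):

struct vanj {
	unsigned int cwnd;		/* in whole segments */
	unsigned int cwnd_cnt;		/* fractional-increase counter */
	unsigned int ssthresh;
};

static void vanj_on_ack(struct vanj *c)
{
	if (c->cwnd <= c->ssthresh) {
		c->cwnd++;		/* slow start: +1 per ACK */
	} else if (c->cwnd_cnt >= c->cwnd) {
		c->cwnd++;		/* avoidance: cwnd += 1/cwnd, */
		c->cwnd_cnt = 0;	/* i.e. +1 segment per RTT */
	} else {
		c->cwnd_cnt++;
	}
}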
+#define FLAG_DATA 0x01
+#define FLAG_WIN_UPDATE 0x02
+#define FLAG_DATA_ACKED 0x04
+
/*
* This routine deals with incoming acks, but not outgoing ones.
- *
- * This routine is totally _WRONG_. The list structuring is wrong,
- * the algorithm is wrong, the code is wrong.
*/
-static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
+static int tcp_ack(struct sock *sk, struct tcphdr *th,
+ u32 ack_seq, u32 ack, int len)
{
int flag = 0;
- u32 window_seq;
+ u32 seq = 0;
+ u32 seq_rtt = 0;
+ struct sk_buff *skb;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- /*
- * 1 - there was data in packet as well as ack or new data is sent or
- * in shutdown state
- * 2 - data from retransmit queue was acked and removed
- * 4 - window shrunk or data from retransmit queue was acked and removed
- */
if(sk->zapped)
return(1); /* Dead, can't ack any more so why bother */
- /*
- * We have dropped back to keepalive timeouts. Thus we have
- * no retransmits pending.
- */
- if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
- sk->retransmits = 0;
+ if (tp->pending == TIME_KEEPOPEN)
+ {
+ tp->probes_out = 0;
+ }
+ tp->rcv_tstamp = jiffies;
+
/*
* If the ack is newer than sent or older than previous acks
* then we can probably ignore it.
*/
- if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
+ if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
goto uninteresting_ack;
/*
- * Have we discovered a larger window
+ * If there is data set flag 1
*/
- window_seq = ntohs(th->window);
- if (window_seq > sk->max_window)
+
+ if (len != th->doff*4)
{
- sk->max_window = window_seq;
-#ifdef CONFIG_INET_PCTCP
- /* Hack because we don't send partial packets to non SWS
- handling hosts */
- sk->mss = min(window_seq>>1, sk->mtu);
-#else
- sk->mss = min(window_seq, sk->mtu);
-#endif
+ flag |= FLAG_DATA;
+ tcp_delack_estimator(tp);
}
- window_seq += ack;
/*
- * See if our window has been shrunk.
+ * Update our send window
*/
- if (after(sk->window_seq, window_seq))
- tcp_window_shrunk(sk, window_seq);
/*
- * Pipe has emptied
- */
- if (sk->send_tail == NULL || sk->send_head == NULL)
+ * This is the window update code as per RFC 793
+ * snd_wl{1,2} are used to prevent unordered
+ * segments from shrinking the window
+ */
+
+ if ((tp->snd_wl1 == 0) || before(tp->snd_wl1, ack_seq) ||
+ (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack)))
{
- sk->send_head = NULL;
- sk->send_tail = NULL;
- sk->send_next = NULL;
- sk->packets_out= 0;
+ tp->snd_wnd = ntohs(th->window);
+ tp->snd_wl1 = ack_seq;
+ tp->snd_wl2 = ack;
+
+ flag |= FLAG_WIN_UPDATE;
+
+ if (tp->snd_wnd > sk->max_window)
+ {
+ sk->max_window = tp->snd_wnd;
+ }
}
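
The guard condition above is RFC 793's rule that only a segment no older than the one that last set the window may update it. As a pure function (before()/after() re-declared so the sketch stands alone):

typedef unsigned int u32;

static int before(u32 a, u32 b) { return (int)(a - b) < 0; }
static int after(u32 a, u32 b)  { return (int)(b - a) < 0; }

static int may_update_window(u32 snd_wl1, u32 snd_wl2,
			     u32 seg_seq, u32 seg_ack)
{
	return snd_wl1 == 0 ||			/* never updated yet */
	       before(snd_wl1, seg_seq) ||	/* newer data seen */
	       (snd_wl1 == seg_seq &&		/* same data, but an */
		!after(snd_wl2, seg_ack));	/* ack at least as new */
}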
+
/*
- * We don't want too many packets out there.
+ * We passed data and got it acked, remove any soft error
+ * log. Something worked...
*/
- if (sk->ip_xmit_timeout == TIME_WRITE &&
- sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
+ sk->err_soft = 0;
+
+ /*
+ * If this ack opens up a zero window, clear backoff. It was
+ * being used to time the probes, and is probably far higher than
+ * it needs to be for normal retransmission.
+ */
+
+ if (tp->pending == TIME_PROBE0)
{
+ tp->probes_out = 0; /* Our probe was answered */
- /*
- * This is Jacobson's slow start and congestion avoidance.
- * SIGCOMM '88, p. 328. Because we keep cong_window in integral
- * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
- * counter and increment it once every cwnd times. It's possible
- * that this should be done only if sk->retransmits == 0. I'm
- * interpreting "new data is acked" as including data that has
- * been retransmitted but is just now being acked.
+ /*
+ * Was it a usable window open ?
*/
- if (sk->cong_window <= sk->ssthresh)
- /*
- * In "safe" area, increase
- */
- sk->cong_window++;
- else
+
+ /* should always be non-null */
+ if (tp->send_head != NULL &&
+ !before (ack + tp->snd_wnd, tp->send_head->end_seq))
{
- /*
- * In dangerous area, increase slowly. In theory this is
- * sk->cong_window += 1 / sk->cong_window
- */
- if (sk->cong_count >= sk->cong_window)
- {
- sk->cong_window++;
- sk->cong_count = 0;
- }
- else
- sk->cong_count++;
- }
- }
+ tp->backoff = 0;
+ tp->pending = 0;
- /*
- * Remember the highest ack received and update the
- * right hand window edge of the host.
- * We do a bit of work here to track number of times we've
- * seen this ack without a change in the right edge of the
- * window and no data in the packet.
- * This will allow us to do fast retransmits.
- */
-
- /* We are looking for duplicate ACKs here.
- * An ACK is a duplicate if:
- * (1) it has the same sequence number as the largest number we've seen,
- * (2) it has the same window as the last ACK,
- * (3) we have outstanding data that has not been ACKed
- * (4) The packet was not carrying any data.
- * (5) [From Floyd's paper on fast retransmit wars]
- * The packet acked data after high_seq;
- * I've tried to order these in occurrence of most likely to fail
- * to least likely to fail.
- * [These are an extension of the rules BSD stacks use to
- * determine if an ACK is a duplicate.]
- */
-
- if (sk->rcv_ack_seq == ack
- && sk->window_seq == window_seq
- && len != th->doff*4
- && before(ack, sk->sent_seq)
- && after(ack, sk->high_seq))
- {
- /* Prevent counting of duplicate ACKs if the congestion
- * window is smaller than 3. Note that since we reduce
- * the congestion window when we do a fast retransmit,
- * we must be careful to keep counting if we were already
- * counting. The idea behind this is to avoid doing
- * fast retransmits if the congestion window is so small
- * that we cannot get 3 ACKs due to the loss of a packet
- * unless we are getting ACKs for retransmitted packets.
- */
- if (sk->cong_window >= 3 || sk->rcv_ack_cnt > MAX_DUP_ACKS+1)
- sk->rcv_ack_cnt++;
- /* See draft-stevens-tcpca-spec-01 for explanation
- * of what we are doing here.
- */
- if (sk->rcv_ack_cnt == MAX_DUP_ACKS+1) {
- int tmp;
+ tcp_clear_xmit_timer(sk, TIME_PROBE0);
- /* We need to be a bit careful to preserve the
- * count of packets that are out in the system here.
- */
- sk->ssthresh = max(sk->cong_window >> 1, 2);
- sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1;
- tmp = sk->packets_out;
- tcp_do_retransmit(sk,0);
- sk->packets_out = tmp;
- } else if (sk->rcv_ack_cnt > MAX_DUP_ACKS+1) {
- sk->cong_window++;
- /*
- * At this point we are suppose to transmit a NEW
- * packet (not retransmit the missing packet,
- * this would only get us into a retransmit war.)
- * I think that having just adjusted cong_window
- * we will transmit the new packet below.
- */
- }
- }
- else
- {
- if (sk->rcv_ack_cnt > MAX_DUP_ACKS) {
- sk->cong_window = sk->ssthresh;
}
- sk->window_seq = window_seq;
- sk->rcv_ack_seq = ack;
- sk->rcv_ack_cnt = 1;
- }
-
- /*
- * We passed data and got it acked, remove any soft error
- * log. Something worked...
- */
-
- sk->err_soft = 0;
-
- /*
- * If this ack opens up a zero window, clear backoff. It was
- * being used to time the probes, and is probably far higher than
- * it needs to be for normal retransmission.
- */
-
- if (sk->ip_xmit_timeout == TIME_PROBE0)
- {
- sk->retransmits = 0; /* Our probe was answered */
-
- /*
- * Was it a usable window open ?
- */
-
- if (!skb_queue_empty(&sk->write_queue) && /* should always be true */
- ! before (sk->window_seq, sk->write_queue.next->end_seq))
+ else
{
- sk->backoff = 0;
-
- /*
- * Recompute rto from rtt. this eliminates any backoff.
- */
-
- /*
- * Appendix C of Van Jacobson's final version of
- * the SIGCOMM 88 paper states that although
- * the original paper suggested that
- * RTO = R*2V
- * was the correct calculation experience showed
- * better results using
- * RTO = R*4V
- * In particular this gives better performance over
- * slow links, and should not effect fast links.
- *
- * Note: Jacobson's algorithm is fine on BSD which
- * has a 1/2 second granularity clock, but with our
- * 1/100 second granularity clock we become too
- * sensitive to minor changes in the round trip time.
- * We add in two compensating factors.
- * First we multiply by 5/4. For large congestion
- * windows this allows us to tolerate burst traffic
- * delaying up to 1/4 of our packets.
- * We also add in a rtt / cong_window term.
- * For small congestion windows this allows
- * a single packet delay, but has negligible effect
- * on the compensation for large windows.
- */
- sk->rto = (sk->rtt >> 3) + sk->mdev;
- sk->rto += (sk->rto>>2) + (sk->rto >> (sk->cong_window-1));
- if (sk->rto > 120*HZ)
- sk->rto = 120*HZ;
- if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about
- .2 of a second because of BSD delayed acks - on a 100Mb/sec link
- .2 of a second is going to need huge windows (SIGH) */
- sk->rto = HZ/5;
+ tcp_reset_xmit_timer(sk, TIME_PROBE0,
+ min(tp->rto << tp->backoff,
+ 120*HZ));
}
}
/*
* See if we can take anything off of the retransmit queue.
*/
+
+ start_bh_atomic();
- for (;;) {
- struct sk_buff * skb = sk->send_head;
- if (!skb)
- break;
-
+ while(((skb=skb_peek(&sk->write_queue)) != NULL) &&
+ (skb != tp->send_head))
+ {
/* Check for a bug. */
- if (skb->link3 && after(skb->end_seq, skb->link3->end_seq))
- printk("INET: tcp.c: *** bug send_list out of order.\n");
-
+
+ if (skb->next != (struct sk_buff*) &sk->write_queue &&
+ after(skb->end_seq, skb->next->seq))
+ printk("INET: tcp_input.c: *** "
+ "bug send_list out of order.\n");
+
/*
* If our packet is before the ack sequence we can
- * discard it as it's confirmed to have arrived the other end.
+ * discard it as it's confirmed to have arrived at
+ * the other end.
*/
- if (after(skb->end_seq, ack))
- break;
-
- if (sk->retransmits)
+ if (!after(skb->end_seq, ack))
{
- /*
- * We were retransmitting. don't count this in RTT est
- */
- flag |= 2;
- }
+ if (sk->debug)
+ {
+ printk(KERN_DEBUG "removing seg %x-%x from "
+ "retransmit queue\n",
+ skb->seq, skb->end_seq);
+ }
+
+ tp->retrans_head = NULL;
+
+ flag |= FLAG_DATA_ACKED;
+ seq = skb->seq;
+ seq_rtt = jiffies - skb->when;
+
+ skb_unlink(skb);
+ atomic_dec(&sk->packets_out);
+ skb->free = 1;
- if ((sk->send_head = skb->link3) == NULL)
+ kfree_skb(skb, FREE_WRITE);
+
+ if (!sk->dead)
+ sk->write_space(sk);
+ }
+ else
{
- sk->send_tail = NULL;
- sk->send_next = NULL;
- sk->retransmits = 0;
+ break;
}
+ }
- /*
- * advance the send_next pointer if needed.
- */
- if (sk->send_next == skb)
- sk->send_next = sk->send_head;
+ end_bh_atomic();
+
+ /*
+ * if we were retransmitting, don't count the rtt estimate
+ */
+ if (sk->retransmits)
+ {
+ if (sk->packets_out == 0)
+ sk->retransmits = 0;
+ }
+ else
+ {
/*
* Note that we only reset backoff and rto in the
* rtt recomputation code. And that doesn't happen
@@ -946,274 +699,89 @@
* suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
- /*
- * We have one less packet out there.
- */
-
- if (sk->packets_out > 0)
- sk->packets_out --;
-
- /* This is really only supposed to be called when we
- * are actually ACKing new data, which should exclude
- * the ACK handshake on an initial SYN packet as well.
- * Rather than introducing a new test here for this
- * special case, we just reset the initial values for
- * rtt immediately after we move to the established state.
- */
- if (!(flag&2)) /* Not retransmitting */
- tcp_rtt_estimator(sk,skb);
- IS_SKB(skb);
-
- /*
- * We may need to remove this from the dev send list.
- */
- cli();
- if (skb->next)
- skb_unlink(skb);
- sti();
- kfree_skb(skb, FREE_WRITE); /* write. */
- if (!sk->dead)
- sk->write_space(sk);
- }
-
- /*
- * Maybe we can take some stuff off of the write queue,
- * and put it onto the xmit queue.
- * There is bizarre case being tested here, to check if
- * the data at the head of the queue ends before the start of
- * the sequence we already ACKed. This is not an error,
- * it can occur when we send a packet directly off of the write_queue
- * in a zero window probe.
- */
-
- if (!skb_queue_empty(&sk->write_queue) &&
- !before(sk->window_seq, sk->write_queue.next->end_seq) &&
- (sk->retransmits == 0 ||
- sk->ip_xmit_timeout != TIME_WRITE ||
- !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq)) &&
- sk->packets_out < sk->cong_window)
- {
- /*
- * Add more data to the send queue.
- */
- tcp_write_xmit(sk);
- }
-
- /*
- * Reset timers to reflect the new state.
- *
- * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
- * from TCP_CLOSE we don't do anything
- *
- * from anything else, if there is queued data (or fin) pending,
- * we use a TIME_WRITE timeout, if there is data to write but
- * no room in the window we use TIME_PROBE0, else if keepalive
- * we reset to a KEEPALIVE timeout, else we delete the timer.
- *
- * We do not set flag for nominal write data, otherwise we may
- * force a state where we start to write itsy bitsy tidbits
- * of data.
- */
-
- switch(sk->state) {
- case TCP_TIME_WAIT:
- /*
- * keep us in TIME_WAIT until we stop getting packets,
- * reset the timeout.
- */
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- break;
- case TCP_CLOSE:
- /*
- * don't touch the timer.
- */
- break;
- default:
- /*
- * Must check send_head and write_queue
- * to determine which timeout to use.
- */
- if (sk->send_head) {
- tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- } else if (!skb_queue_empty(&sk->write_queue)
- && sk->ack_backlog == 0)
+ if (flag & FLAG_DATA_ACKED)
{
- /*
- * if the write queue is not empty when we get here
- * then we failed to move any data to the retransmit
- * queue above. (If we had send_head would be non-NULL).
- * Furthermore, since the send_head is NULL here
- * we must not be in retransmit mode at this point.
- * This implies we have no packets in flight,
- * hence sk->packets_out < sk->cong_window.
- * Examining the conditions for the test to move
- * data to the retransmission queue we find that
- * we must therefore have a zero window.
- * Hence, if the ack_backlog is 0 we should initiate
- * a zero probe.
- * We don't do a zero probe if we have a delayed
- * ACK in hand since the other side may have a
- * window opening, but they are waiting to hear
- * from us before they tell us about it.
- * (They are applying Nagle's rule).
- * So, we don't set up the zero window probe
- * just yet. We do have to clear the timer
- * though in this case...
- */
- tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
- } else if (sk->keepopen) {
- tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
- } else {
- del_timer(&sk->retransmit_timer);
- sk->ip_xmit_timeout = 0;
+ tcp_rtt_estimator(tp, seq_rtt);
+ if (sysctl_tcp_vegas_cong_avoidance)
+ {
+ tcp_cong_avoid_vegas(sk, seq, ack, seq_rtt);
+ }
+ else
+ {
+ tcp_cong_avoid_vanj(sk, seq, ack, seq_rtt);
+ }
}
- break;
- }
-
- /*
- * We have nothing queued but space to send. Send any partial
- * packets immediately (end of Nagle rule application).
- */
-
- if (sk->packets_out == 0
- && sk->partial != NULL
- && skb_queue_empty(&sk->write_queue)
- && sk->send_head == NULL)
- {
- tcp_send_partial(sk);
}
- /*
- * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
- * we are now waiting for an acknowledge to our FIN. The other end is
- * already in TIME_WAIT.
- *
- * Move to TCP_CLOSE on success.
- */
+
- if (sk->state == TCP_LAST_ACK)
+ /* Sanity check our packets_out counter */
+ if (skb_queue_len(&sk->write_queue) == 0 ||
+ ack == tp->snd_nxt )
{
- if (!sk->dead)
- sk->state_change(sk);
- if(sk->debug)
- printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
- sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
- if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
+ if (sk->packets_out)
{
- sk->shutdown = SHUTDOWN_MASK;
- tcp_set_state(sk,TCP_CLOSE);
- return 1;
- }
+ printk(KERN_DEBUG "tcp_ack: packets_out %d\n",
+ sk->packets_out);
+ sk->packets_out = 0;
+ }
}
- /*
- * Incoming ACK to a FIN we sent in the case of our initiating the close.
- *
- * Move to FIN_WAIT2 to await a FIN from the other end. Set
- * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
- */
- if (sk->state == TCP_FIN_WAIT1)
+ if (sk->packets_out)
{
-
- if (!sk->dead)
- sk->state_change(sk);
- if (sk->rcv_ack_seq == sk->write_seq)
+ if (flag & FLAG_DATA_ACKED)
{
- sk->shutdown |= SEND_SHUTDOWN;
- tcp_set_state(sk, TCP_FIN_WAIT2);
- /* If the socket is dead, then there is no
- * user process hanging around using it.
- * We want to set up a FIN_WAIT2 timeout ala BSD.
- */
- if (sk->dead)
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
+ long when;
+
+ skb = skb_peek(&sk->write_queue);
+
+ when = tp->rto - (jiffies - skb->when);
+
+ if (when <= 0)
+ {
+ tp->retrans_head = NULL;
+ /*
+ * This is tricky. We are retransmitting a
+ * segment of a window when congestion occurred.
+ */
+ tcp_do_retransmit(sk, 0);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS,
+ tp->rto);
+ }
+ else
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
}
}
+ else
+ tcp_clear_xmit_timer(sk, TIME_RETRANS);
+
/*
- * Incoming ACK to a FIN we sent in the case of a simultaneous close.
- *
- * Move to TIME_WAIT
+ * Remember the highest ack received.
*/
+
+ tp->snd_una = ack;
+
+ tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE)));
- if (sk->state == TCP_CLOSING)
- {
- if (!sk->dead)
- sk->state_change(sk);
- if (sk->rcv_ack_seq == sk->write_seq)
- {
- tcp_time_wait(sk);
- }
- }
-
- /*
- * Final ack of a three way shake
- */
-
- if (sk->state==TCP_SYN_RECV)
- {
- tcp_set_state(sk, TCP_ESTABLISHED);
- tcp_options(sk,th);
- sk->dummy_th.dest=th->source;
- sk->copied_seq = sk->acked_seq;
- if(!sk->dead)
- sk->state_change(sk);
- if(sk->max_window==0)
- {
- sk->max_window=32; /* Sanity check */
- sk->mss=min(sk->max_window,sk->mtu);
- }
- /* Reset the RTT estimator to the initial
- * state rather than testing to avoid
- * updating it on the ACK to the SYN packet.
- */
- sk->rtt = 0;
- sk->rto = TCP_TIMEOUT_INIT;
- sk->mdev = TCP_TIMEOUT_INIT;
- }
-
/*
- * The following code has been greatly simplified from the
- * old hacked up stuff. The wonders of properly setting the
- * retransmission timeouts.
- *
- * If we are retransmitting, and we acked a packet on the retransmit
- * queue, and there is still something in the retransmit queue,
- * then we can output some retransmission packets.
+ * Maybe we can take some stuff off of the write queue,
+ * and put it onto the xmit queue.
*/
- if (sk->send_head != NULL && (flag&2) && sk->retransmits)
- {
- tcp_do_retransmit(sk, 1);
- }
return 1;
uninteresting_ack:
+
+ tcp_fast_retrans(sk, ack, 0);
+
if(sk->debug)
- printk("Ack ignored %u %u\n",ack,sk->sent_seq);
+ printk("Ack ignored %u %u\n",ack,tp->snd_nxt);
- /*
- * Keepalive processing.
- */
-
- if (after(ack, sk->sent_seq))
- {
- return 0;
- }
-
- /*
- * Restart the keepalive timer.
- */
-
- if (sk->keepopen)
- {
- if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
- tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
- }
- return 1;
+ return 0;
}
@@ -1237,6 +805,8 @@
{
sk->fin_seq = skb->end_seq;
+ tcp_send_ack(sk);
+
if (!sk->dead)
{
sk->state_change(sk);
@@ -1249,10 +819,11 @@
case TCP_SYN_SENT:
case TCP_ESTABLISHED:
/*
- * move to CLOSE_WAIT, tcp_data() already handled
- * sending the ack.
+ * move to CLOSE_WAIT
*/
- tcp_set_state(sk,TCP_CLOSE_WAIT);
+
+ tcp_set_state(sk, TCP_CLOSE_WAIT);
+
if (th->rst)
sk->shutdown = SHUTDOWN_MASK;
break;
@@ -1280,27 +851,11 @@
* This causes a WRITE timeout, which will either
* move on to TIME_WAIT when we timeout, or resend
* the FIN properly (maybe we get rid of that annoying
- * FIN lost hang). The TIME_WRITE code is already correct
- * for handling this timeout.
+ * FIN lost hang). The TIME_WRITE code is already
+ * correct for handling this timeout.
*/
- if (sk->ip_xmit_timeout != TIME_WRITE) {
- if (sk->send_head)
- tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- else if (sk->ip_xmit_timeout != TIME_PROBE0
- || skb_queue_empty(&sk->write_queue)) {
- /* BUG check case.
- * We have a problem here if there
- * is no timer running [leads to
- * frozen socket] or no data in the
- * write queue [means we sent a fin
- * and lost it from the queue before
- * changing the ack properly].
- */
- printk(KERN_ERR "Lost timer or fin packet in tcp_fin.\n");
- }
- }
- tcp_set_state(sk,TCP_CLOSING);
+ tcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
/*
@@ -1326,156 +881,176 @@
return(0);
}
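
The receive-side FIN handling above condenses into a small transition map. A sketch with standalone state constants (the kernel's live in tcp.h; the FIN_WAIT2 arm continues past this hunk):

enum tcp_state { TCP_SYN_SENT, TCP_ESTABLISHED, TCP_CLOSE_WAIT,
		 TCP_FIN_WAIT1, TCP_FIN_WAIT2, TCP_CLOSING,
		 TCP_TIME_WAIT };

static enum tcp_state on_fin(enum tcp_state s)
{
	switch (s) {
	case TCP_SYN_SENT:
	case TCP_ESTABLISHED:
		return TCP_CLOSE_WAIT;	/* peer closed first */
	case TCP_FIN_WAIT1:
		return TCP_CLOSING;	/* simultaneous close */
	case TCP_FIN_WAIT2:
		return TCP_TIME_WAIT;	/* we closed first */
	default:
		return s;		/* duplicate FIN: no change */
	}
}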
-/*
- * Add a sk_buff to the TCP receive queue, calculating
- * the ACK sequence as we go..
- */
-static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
-{
- struct sk_buff * prev, * next;
- u32 seq;
+
/*
- * Find where the new skb goes.. (This goes backwards,
- * on the assumption that we get the packets in order)
+ * This one checks to see if we can put data from the
+ * out_of_order queue into the receive_queue
*/
- seq = skb->seq;
- prev = list->prev;
- next = (struct sk_buff *) list;
- for (;;) {
- if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
+
+static __inline__ void tcp_ofo_queue(struct sock *sk)
+{
+ struct sk_buff * skb;
+ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+
+ while ((skb = skb_peek(&sk->out_of_order_queue))) {
+
+ if (after(skb->seq, tp->rcv_nxt))
break;
- next = prev;
- prev = prev->prev;
+
+ if (!after(skb->end_seq, tp->rcv_nxt)) {
+
+ if (sk->debug)
+ printk("ofo packet was allready received \n");
+
+ skb_unlink(skb);
+ kfree_skb(skb, FREE_READ);
+
+ continue;
+ }
+
+ if (sk->debug)
+ printk("ofo requeuing : rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, skb->seq, skb->end_seq);
+
+ skb_unlink(skb);
+
+
+ skb_queue_tail(&sk->receive_queue, skb);
+
+
+ tp->rcv_nxt = skb->end_seq;
}
- __skb_insert(skb, prev, next, list);
}
-/*
- * Called for each packet when we find a new ACK endpoint sequence in it
- */
-static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
+static __inline__ void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
+ struct sk_buff * skb1;
+ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+
/*
- * When we ack the fin, we do the FIN
- * processing.
+ * Queue data for delivery to the user
+ * Packets in sequence go to the receive queue
+ * Out of sequence packets to out_of_order_queue
*/
- skb->acked = 1;
- if (skb->h.th->fin)
- tcp_fin(skb,sk,skb->h.th);
- return skb->end_seq;
-}
-static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
-{
- u32 ack_seq;
- tcp_insert_skb(skb, &sk->receive_queue);
+ if (skb->seq == tp->rcv_nxt) {
+
+ /*
+ * Ok. In sequence.
+ */
+
+
+ skb_queue_tail(&sk->receive_queue, skb);
+
+
+ tp->rcv_nxt = skb->end_seq;
+
+ tcp_ofo_queue(sk);
+
+ if (skb_queue_len(&sk->out_of_order_queue) == 0)
+ tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd);
+ return;
+ }
+
/*
- * Did we get anything new to ack?
+ * Not in sequence
+ * either a retransmit or some packet got lost
*/
- ack_seq = sk->acked_seq;
+ if (!after(skb->end_seq, tp->rcv_nxt)) {
+
+ /*
+ * A retransmit.
+ * 2nd most common case.
+ * force an immediate ack
+ */
- if (!after(skb->seq, ack_seq)) {
- if (after(skb->end_seq, ack_seq)) {
- /* the packet straddles our window end */
- struct sk_buff_head * list = &sk->receive_queue;
- struct sk_buff * next;
- ack_seq = tcp_queue_ack(skb, sk);
+ if (sk->debug)
+ printk("retransmit received: seq %X\n", skb->seq);
- /*
- * Do we have any old packets to ack that the above
- * made visible? (Go forward from skb)
- */
- next = skb->next;
- while (next != (struct sk_buff *) list) {
- if (after(next->seq, ack_seq))
- break;
- if (after(next->end_seq, ack_seq))
- ack_seq = tcp_queue_ack(next, sk);
- next = next->next;
- }
+ sk->delayed_acks = MAX_DELAY_ACK;
+ kfree_skb(skb, FREE_READ);
- /*
- * Ok, we found new data, update acked_seq as
- * necessary (and possibly send the actual
- * ACK packet).
- */
- sk->acked_seq = ack_seq;
+ return;
+ }
- } else {
- if (sk->debug)
- printk("Ack duplicate packet.\n");
- tcp_send_ack(sk);
- return;
- }
+ if (before(skb->seq, tp->rcv_nxt)) {
/*
- * Delay the ack if possible. Send ack's to
- * fin frames immediately as there shouldn't be
- * anything more to come.
+ * Partial packet
+ * seq < rcv_next < end_seq
*/
- if (!sk->delay_acks || th->fin) {
- tcp_send_ack(sk);
- } else {
- /*
- * If psh is set we assume it's an
- * interactive session that wants quick
- * acks to avoid nagling too much.
- */
- int delay = HZ/2;
- if (th->psh)
- delay = HZ/50;
- tcp_send_delayed_ack(sk, delay, sk->ato);
- }
- /*
- * Tell the user we have some more data.
- */
+ if (sk->debug)
+ printk("partial packet: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, skb->seq, skb->end_seq);
+
+ skb_queue_tail(&sk->receive_queue, skb);
- if (!sk->dead)
- sk->data_ready(sk,0);
- }
- else
- {
- /*
- * If we've missed a packet, send an ack.
- * Also start a timer to send another.
- *
- * 4.3reno machines look for these kind of acks so
- * they can do fast recovery. Three identical 'old'
- * acks lets it know that one frame has been lost
- * and should be resent. Because this is before the
- * whole window of data has timed out it can take
- * one lost frame per window without stalling.
- * [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
- *
- * We also should be spotting triple bad sequences.
- * [We now do this.]
- *
- */
-
- if (!skb->acked)
- {
- if(sk->debug)
- printk("Ack past end of seq packet.\n");
- tcp_send_ack(sk);
- /*
- * We need to be very careful here. We must
- * not violate Jacobsons packet conservation condition.
- * This means we should only send an ACK when a packet
- * leaves the network. We can say a packet left the
- * network when we see a packet leave the network, or
- * when an rto measure expires.
- */
- tcp_send_delayed_ack(sk,sk->rto,sk->rto);
- }
- }
-}
+ tp->rcv_nxt = skb->end_seq;
+
+ tcp_ofo_queue(sk);
+
+ if (skb_queue_len(&sk->out_of_order_queue) == 0)
+ tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd);
+
+ return;
+ }
+
+ /*
+ * Ok. This is an out_of_order segment
+ */
+
+ /* Force an ack */
+
+ sk->delayed_acks = MAX_DELAY_ACK;
+
+ /*
+ * disable header prediction
+ */
+
+ tp->pred_flags = 0;
+
+ if (sk->debug)
+ printk("out of order segment: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, skb->seq, skb->end_seq);
+
+ if (skb_peek(&sk->out_of_order_queue) == NULL) {
+ skb_queue_head(&sk->out_of_order_queue,skb);
+ }
+ else
+ for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) {
+
+ /* already there */
+ if (skb->seq==skb1->seq && skb->len>=skb1->len)
+ {
+ skb_append(skb1,skb);
+ skb_unlink(skb1);
+ kfree_skb(skb1,FREE_READ);
+ break;
+ }
+
+ if (after(skb->seq, skb1->seq))
+ {
+ skb_append(skb1,skb);
+ break;
+ }
+
+ /*
+ * See if we've hit the start. If so insert.
+ */
+ if (skb1 == skb_peek(&sk->out_of_order_queue)) {
+ skb_queue_head(&sk->out_of_order_queue,skb);
+ break;
+ }
+ }
+
+}
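
A sketch of the same in-order/out-of-order split over a plain sorted singly linked list instead of sk_buff queues (the list type and the dummy-head convention are assumptions):

typedef unsigned int u32;

static int after(u32 a, u32 b) { return (int)(b - a) < 0; }

struct seg {
	u32 seq, end_seq;
	struct seg *next;
};

/* rcvq_tail points at the receive queue's tail (a dummy head node
 * keeps it non-null); ofo is the head of the sorted ooo list. */
static void queue_segment(struct seg **rcvq_tail, struct seg **ofo,
			  u32 *rcv_nxt, struct seg *s)
{
	struct seg **p;

	if (s->seq == *rcv_nxt) {	/* in sequence: the common case */
		s->next = 0;
		(*rcvq_tail)->next = s;
		*rcvq_tail = s;
		*rcv_nxt = s->end_seq;
		/* ...here tcp_ofo_queue() would drain any parked
		 * segments that rcv_nxt has just caught up with */
		return;
	}
	if (!after(s->end_seq, *rcv_nxt))
		return;			/* pure retransmit: drop it */
	for (p = ofo; *p && !after((*p)->seq, s->seq); p = &(*p)->next)
		;			/* keep the ooo list sorted */
	s->next = *p;
	*p = s;
}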
/*
@@ -1484,117 +1059,124 @@
* room, then we will just have to discard the packet.
*/
-static int tcp_data(struct sk_buff *skb, struct sock *sk,
- unsigned long saddr, unsigned int len)
+static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
{
struct tcphdr *th;
- u32 new_seq, shut_seq;
+ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
th = skb->h.th;
skb_pull(skb,th->doff*4);
skb_trim(skb,len-(th->doff*4));
+ if (skb->len == 0 && !th->fin)
+ {
+ return(0);
+ }
+
/*
- * The bytes in the receive read/assembly queue has increased. Needed for the
- * low memory discard algorithm
+ * FIXME: don't accept data after the received fin
+ */
+
+ /*
+ * The bytes in the receive read/assembly queue have increased.
+ * Needed for the low memory discard algorithm
*/
sk->bytes_rcv += skb->len;
-
- if (skb->len == 0 && !th->fin)
+
+ /*
+ * We no longer have anyone receiving data on this connection.
+ */
+
+ tcp_data_queue(sk, skb);
+
+ if (before(tp->rcv_nxt, sk->copied_seq))
{
- /*
- * Don't want to keep passing ack's back and forth.
- * (someone sent us dataless, boring frame)
- */
- if (!th->ack)
- tcp_send_ack(sk);
- kfree_skb(skb, FREE_READ);
- return(0);
+ printk("*** tcp.c:tcp_data bug acked < copied\n");
+ tp->rcv_nxt = sk->copied_seq;
}
+ sk->delayed_acks++;
+
/*
- * We no longer have anyone receiving data on this connection.
+ * Now tell the user we may have some data.
*/
+
+ if (!sk->dead)
+ {
+ if(sk->debug)
+ printk("Data wakeup.\n");
+ sk->data_ready(sk,0);
+ }
+ return(1);
+}
-#ifndef TCP_DONT_RST_SHUTDOWN
+static void tcp_data_snd_check(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
- if(sk->shutdown & RCV_SHUTDOWN)
+ if ((skb = tp->send_head))
{
- /*
- * FIXME: BSD has some magic to avoid sending resets to
- * broken 4.2 BSD keepalives. Much to my surprise a few non
- * BSD stacks still have broken keepalives so we want to
- * cope with it.
- */
-
- if(skb->len) /* We don't care if it's just an ack or
- a keepalive/window probe */
+ if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
+ sk->packets_out < sk->cong_window )
{
- new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
-
- /* Do this the way 4.4BSD treats it. Not what I'd
- regard as the meaning of the spec but it's what BSD
- does and clearly they know everything 8) */
-
/*
- * This is valid because of two things
- *
- * a) The way tcp_data behaves at the bottom.
- * b) A fin takes effect when read not when received.
+ * Add more data to the send queue.
*/
-
- shut_seq = sk->acked_seq+1; /* Last byte */
-
- if(after(new_seq,shut_seq))
- {
- if(sk->debug)
- printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
- sk, new_seq, shut_seq, sk->blog);
- if(sk->dead)
- {
- sk->acked_seq = new_seq + th->fin;
- tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
- sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
- tcp_statistics.TcpEstabResets++;
- sk->err = EPIPE;
- sk->error_report(sk);
- sk->shutdown = SHUTDOWN_MASK;
- tcp_set_state(sk,TCP_CLOSE);
- kfree_skb(skb, FREE_READ);
- return 0;
- }
- }
+
+ tcp_write_xmit(sk);
+ wake_up_interruptible(sk->sleep);
}
+ else if (sk->packets_out == 0 && !tp->pending)
+ {
+ /*
+ * Data to queue but no room.
+ */
+ tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
+ }
}
+}
-#endif
-
+static __inline__ void tcp_ack_snd_check(struct sock *sk)
+{
/*
- * We should only call this if there is data in the frame.
- */
- tcp_delack_estimator(sk);
+ * This also takes care of updating the window.
+ * This if statement needs to be simplified.
+ *
+ * rules for delaying an ack:
+ * - delay time <= HZ/2 ticks (0.5 seconds)
+ * - we don't have a window update to send
+ * - must send an ack at least every 2 full-sized packets
+ */
- tcp_queue(skb, sk, th);
+ if (sk->delayed_acks == 0)
+ return;
- return(0);
+ if (sk->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk))
+ {
+ tcp_send_ack(sk);
+ }
+ else
+ {
+ tcp_send_delayed_ack(sk, HZ/2);
+ }
}
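
Reduced to a pure decision function, the rules above look like this (MAX_DELAY_ACK = 2 is an assumption matching "every 2 full-sized packets"):

#define MAX_DELAY_ACK 2			/* assumption, per the comment */

enum ack_action { ACK_NONE, ACK_NOW, ACK_DELAYED };

static enum ack_action ack_check(int delayed_acks, int window_update_due)
{
	if (delayed_acks == 0)
		return ACK_NONE;	/* nothing owed to the peer */
	if (delayed_acks >= MAX_DELAY_ACK || window_update_due)
		return ACK_NOW;		/* two segments owed, or the
					 * window needs opening */
	return ACK_DELAYED;		/* timer will fire within HZ/2 */
}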
-
/*
* This routine is only called when we have urgent data
* signalled. Its the 'slow' part of tcp_urg. It could be
* moved inline now as tcp_urg is only called from one
* place. We handle URGent data wrong. We have to - as
* BSD still doesn't use the correction from RFC961.
- *
* For 1003.1g we should support a new option TCP_STDURG to permit
* either form.
*/
static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
u32 ptr = ntohs(th->urg_ptr);
if (ptr)
@@ -1628,6 +1210,9 @@
sk->copied_seq++; /* Move the copied sequence on correctly */
sk->urg_data = URG_NOTYET;
sk->urg_seq = ptr;
+
+ /* disable header prediction */
+ tp->pred_flags = 0;
}
/*
@@ -1662,429 +1247,430 @@
}
}
-/*
- * This should be a bit smarter and remove partially
- * overlapping stuff too, but this should be good
- * enough for any even remotely normal case (and the
- * worst that can happen is that we have a few
- * unnecessary packets in the receive queue).
- *
- * This function is never called with an empty list..
- */
-static inline void tcp_remove_dups(struct sk_buff_head * list)
-{
- struct sk_buff * next = list->next;
-
- for (;;) {
- struct sk_buff * skb = next;
- next = next->next;
- if (next == (struct sk_buff *) list)
- break;
- if (before(next->end_seq, skb->end_seq)) {
- __skb_unlink(next, list);
- kfree_skb(next, FREE_READ);
- next = skb;
- continue;
- }
- if (next->seq != skb->seq)
- continue;
- __skb_unlink(skb, list);
- kfree_skb(skb, FREE_READ);
- }
-}
-/*
- * Throw out all unnecessary packets: we've gone over the
- * receive queue limit. This shouldn't happen in a normal
- * TCP connection, but we might have gotten duplicates etc.
- */
-static void prune_queue(struct sk_buff_head * list)
+static __inline__ void prune_queue(struct sock *sk)
{
- for (;;) {
- struct sk_buff * skb = list->prev;
+ struct sk_buff * skb;
- /* gone through it all? */
- if (skb == (struct sk_buff *) list)
- break;
- if (!skb->acked) {
- __skb_unlink(skb, list);
- kfree_skb(skb, FREE_READ);
- continue;
- }
- tcp_remove_dups(list);
- break;
+ /*
+ * clean the out_of_order queue
+ */
+
+ while ((skb = skb_dequeue(&sk->out_of_order_queue)))
+ {
+ kfree_skb(skb, FREE_READ);
}
}
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-/*
- * Check whether a received TCP packet might be for one of our
- * connections.
- */
-int tcp_chkaddr(struct sk_buff *skb)
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ struct tcphdr *th, __u16 len)
{
- struct iphdr *iph = skb->h.iph;
- struct tcphdr *th = (struct tcphdr *)(skb->h.raw + iph->ihl*4);
- struct sock *sk;
-
- sk = get_sock(&tcp_prot, th->dest, iph->saddr, th->source, iph->daddr, 0, 0);
-
- if (!sk) return 0;
- /* 0 means accept all LOCAL addresses here, not all the world... */
- if (sk->rcv_saddr == 0) return 0;
- return 1;
-}
-#endif
-
-/*
- * A TCP packet has arrived.
- * skb->h.raw is the TCP header.
- */
-
-int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
- __u32 daddr, unsigned short len,
- __u32 saddr, int redo, struct inet_protocol * protocol)
-{
- struct tcphdr *th;
- struct sock *sk;
- __u32 seq;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- int r;
-#endif
+ struct tcp_opt *tp;
+ int queued = 0;
+ u32 flg;
+
+ /*
+ * Header prediction.
+ * The code follows the one in the famous
+ * "30 instruction TCP receive" Van Jacobson mail.
+ *
+ * Van's trick is to deposit buffers into socket queue
+ * on a device interrupt, then to call the tcp_recv function
+ * in the receive process context to checksum and copy
+ * the buffer to user space. Smart...
+ *
+ * Our current scheme is not silly either but we take the
+ * extra cost of the net_bh soft interrupt processing...
+ * We do checksum and copy also but from device to kernel.
+ */
+ tp = &(sk->tp_pinfo.af_tcp);
+ flg = *(((u32 *)th) + 3);
+
/*
- * "redo" is 1 if we have already seen this skb but couldn't
- * use it at that time (the socket was locked). In that case
- * we have already done a lot of the work (looked up the socket
- * etc).
+ * pred_flags is 0x5?10 << 16 + snd_wnd
+ * if header prediction is to be made,
+ * ? must be 0, else it will be non-zero
+ * (when there are holes in the receive
+ * space for instance)
*/
- th = skb->h.th;
- sk = skb->sk;
- if (!redo) {
- tcp_statistics.TcpInSegs++;
- if (skb->pkt_type!=PACKET_HOST)
- goto discard_it;
- /*
- * Pull up the IP header.
- */
-
- skb_pull(skb, skb->h.raw-skb->data);
+ if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt)
+ {
+ if (len <= sizeof(struct tcphdr))
+ {
+ if (len == sizeof(struct tcphdr))
+ {
+ tcp_ack(sk, th, skb->seq, skb->ack_seq, len);
+ }
- /*
- * Try to use the device checksum if provided.
- */
- switch (skb->ip_summed)
+ tcp_data_snd_check(sk);
+
+ kfree_skb(skb, FREE_READ);
+ return;
+
+ }
+ else if (skb->ack_seq == tp->snd_una)
{
- case CHECKSUM_NONE:
- skb->csum = csum_partial((char *)th, len, 0);
- case CHECKSUM_HW:
- if (tcp_check(th, len, saddr, daddr, skb->csum))
- goto discard_it;
- default:
- /* CHECKSUM_UNNECESSARY */
- }
- sk = get_tcp_sock(saddr, th->source, daddr, th->dest, dev->pa_addr, skb->redirport);
- if (!sk)
- goto no_tcp_socket;
- skb->sk = sk;
- skb->seq = ntohl(th->seq);
- skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
- skb->ack_seq = ntohl(th->ack_seq);
-
- skb->acked = 0;
- skb->used = 0;
- skb->free = 1;
- skb->saddr = daddr;
- skb->daddr = saddr;
+ /*
+ * Bulk data transfer: receiver
+ */
+
+ skb_pull(skb,sizeof(struct tcphdr));
+
+ skb_queue_tail(&sk->receive_queue, skb);
+ tp->rcv_nxt = skb->end_seq;
+ sk->bytes_rcv += len - sizeof(struct tcphdr);
+
+ sk->data_ready(sk, 0);
+ tcp_delack_estimator(tp);
- /*
- * We may need to add it to the backlog here.
- */
- if (sk->users)
+ if (sk->delayed_acks++)
+ {
+ tcp_send_delayed_ack(sk, HZ/2);
+ }
+ else
+ tcp_send_ack(sk);
+
+ return;
+ }
+ }
+
+ if (!tcp_sequence(tp, skb->seq, skb->end_seq))
+ {
+ if (!th->rst)
{
- __skb_queue_tail(&sk->back_log, skb);
- return(0);
+ if (after(skb->seq, tp->rcv_nxt))
+ {
+ printk(KERN_DEBUG "->seq:%d end:%d "
+ "wup:%d wnd:%d\n",
+ skb->seq, skb->end_seq,
+ tp->rcv_wup, tp->rcv_wnd);
+ }
+ tcp_send_ack(sk);
+ kfree_skb(skb, FREE_READ);
+ return;
}
}
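
tcp_sequence() itself lives in the tcp headers; as a sketch under that assumption, the acceptability test is essentially the RFC 793 overlap check against the advertised window:

	/*
	 * Sketch only (an assumption about what tcp_sequence() checks):
	 * a segment is acceptable if some part of it falls inside the
	 * window [rcv_wup, rcv_wup + rcv_wnd).
	 */
	static __inline__ int tcp_seq_ok(struct tcp_opt *tp, u32 seq, u32 end_seq)
	{
		return !before(end_seq, tp->rcv_wup) &&
		       !after(seq, tp->rcv_wup + tp->rcv_wnd);
	}
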
+ if(th->syn && skb->seq != sk->syn_seq)
+ {
+ printk(KERN_DEBUG "syn in established state\n");
+ tcp_reset(sk, skb);
+ kfree_skb(skb, FREE_READ);
+ return;
+ }
+
+ if(th->rst)
+ {
+ tcp_reset(sk,skb);
+ kfree_skb(skb, FREE_READ);
+ return;
+ }
+
+ if(th->ack)
+ {
+ tcp_ack(sk, th, skb->seq, skb->ack_seq, len);
+ }
+
+
/*
- * If this socket has got a reset it's to all intents and purposes
- * really dead. Count closed sockets as dead.
- *
- * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
- * simply drops data. This seems incorrect as a 'closed' TCP doesn't
- * exist so should cause resets as if the port was unreachable.
+ * Process urgent data
*/
- if (sk->zapped || sk->state==TCP_CLOSE)
- goto no_tcp_socket;
+ tcp_urg(sk, th, len);
- if (!sk->prot)
+ /*
+ * step 7: process the segment text
+ */
+
+
+ queued = tcp_data(skb, sk, len);
+
+ /*
+ * step 8: check the FIN bit
+ */
+
+ if (th->fin)
{
- printk(KERN_CRIT "IMPOSSIBLE 3\n");
- return(0);
+ tcp_fin(skb, sk, th);
}
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
/*
- * Charge the memory to the socket.
+ * If our receive queue has grown past its limits,
+ * drop the out-of-order queue to free up memory.
*/
-
- skb->sk=sk;
- atomic_add(skb->truesize, &sk->rmem_alloc);
+ if (sk->rmem_alloc > sk->rcvbuf)
+ prune_queue(sk);
/*
- * Mark the time of the last received packet.
- */
- sk->idletime = jiffies;
+ * And done
+ */
+ if (queued)
+ return;
+
+ kfree_skb(skb, FREE_READ);
+}
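
For context, a hypothetical caller follows; the real dispatch is done in tcp_v4_rcv()/tcp_v6_rcv() and may differ in detail:

	/*
	 * Hypothetical dispatch (assumption): established sockets take
	 * the fast function above, everything else goes through the
	 * RFC 793 state machine in tcp_rcv_state_process() below.
	 * opt is the per-family options pointer.
	 */
	if (sk->state == TCP_ESTABLISHED) {
		tcp_rcv_established(sk, skb, th, len);
		return 0;
	}
	return tcp_rcv_state_process(sk, skb, th, opt, len);
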
+
+
+/*
+ * This function implements the receiving procedure of RFC 793.
+ * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
+ * address independent.
+ */
+
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct tcphdr *th, void *opt, __u16 len)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int queued = 0;
+ int rcv_mss;
+
/*
- * We should now do header prediction.
- */
-
- /*
- * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
- * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
- * compatibility. We also set up variables more thoroughly [Karn notes in the
- * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
+ * state == CLOSED
+ * tested in tcp_v{4,6}_rcv
*/
- if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
- {
-
- /*
- * Now deal with unusual cases.
+ switch (sk->state) {
+
+
+ case TCP_LISTEN:
+
+ if (th->rst)
+ goto discard;
+
+ /*
+ * These use the socket TOS..
+ * it might be better to use the received TOS.
*/
-
- if(sk->state==TCP_LISTEN)
- {
- if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
- tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
+ if(th->ack)
+ {
/*
- * We don't care for RST, and non SYN are absorbed (old segments)
- * Broadcast/multicast SYN isn't allowed. Note - bug if you change the
- * netmask on a running connection it can go broadcast. Even Sun's have
- * this problem so I'm ignoring it
+ * send reset
*/
-
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- /*
- * We may get non-local addresses and still want to
- * handle them locally, due to transparent proxying.
- * Thus, narrow down the test to what is really meant.
- */
- if(th->rst || !th->syn || th->ack || (r = ip_chk_addr(daddr)) == IS_BROADCAST || r == IS_MULTICAST)
-#else
- if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
-#endif
- {
- kfree_skb(skb, FREE_READ);
- return 0;
- }
+
+ return 1;
+ }
- /*
- * Guess we need to make a new socket up
- */
- seq = secure_tcp_sequence_number(saddr, daddr,
- skb->h.th->dest,
- skb->h.th->source);
- tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq);
- /*
- * Now we have several options: In theory there is nothing else
- * in the frame. KA9Q has an option to send data with the syn,
- * BSD accepts data with the syn up to the [to be] advertised window
- * and Solaris 2.1 gives you a protocol error. For now we just ignore
- * it, that fits the spec precisely and avoids incompatibilities. It
- * would be nice in future to drop through and process the data.
+ if(th->syn)
+ {
+ int err;
+ __u32 isn;
+
+ isn = tp->af_specific->init_sequence(sk, skb);
+ err = tp->af_specific->conn_request(sk, skb, opt, isn);
+
+ if (err < 0)
+ return 1;
+
+ /*
+ * Now we have several options: In theory there is
+ * nothing else in the frame. KA9Q has an option to
+ * send data with the syn, BSD accepts data with the
+ * syn up to the [to be] advertised window and
+ * Solaris 2.1 gives you a protocol error. For now
+ * we just ignore it; that fits the spec precisely
+ * and avoids incompatibilities. It would be nice in
+ * future to drop through and process the data.
*
- * Now TTCP is starting to use we ought to queue this data.
+ * Now that TTCP is starting to be used we ought to
+ * queue this data.
*/
-
- return 0;
- }
-
- /*
- * Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
- * then it's a new connection
- */
-
- if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
- {
- kfree_skb(skb, FREE_READ);
+
return 0;
}
+ goto discard;
+ break;
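
The tp->af_specific indirection used above is what makes this function address-family independent. A sketch of such an operations table follows; the struct name is invented for the sketch, but the three members match the calls made in this function:

	struct tcp_af_ops {		/* hypothetical name */
		__u32	(*init_sequence)(struct sock *sk,
					 struct sk_buff *skb);
		int	(*conn_request)(struct sock *sk,
					struct sk_buff *skb,
					void *opt, __u32 isn);
		struct sock *(*get_sock)(struct sk_buff *skb,
					 struct tcphdr *th);
	};
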
+
+ case TCP_SYN_SENT:
+
/*
- * SYN sent means we have to look for a suitable ack and either reset
- * for bad matches or go to connected. The SYN_SENT case is unusual and should
+ * SYN sent means we have to look for a suitable ack and
+ * either reset for bad matches or go to connected.
+ * The SYN_SENT case is unusual and should
* not be in line code. [AC]
*/
- if(sk->state==TCP_SYN_SENT)
+ if(th->ack)
{
- /* Crossed SYN or previous junk segment */
- if(th->ack)
+ /* We got an ack, but it's not a good ack */
+ if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len))
{
- /* We got an ack, but it's not a good ack.
- * We used to test this with a call to tcp_ack,
- * but this loses, because it takes the SYN
- * packet out of the send queue, even if
- * the ACK doesn't have the SYN bit sent, and
- * therefore isn't the one we are waiting for.
- */
- if (after(skb->ack_seq, sk->sent_seq) || before(skb->ack_seq, sk->rcv_ack_seq))
- {
- /* Reset the ack - it's an ack from a
- different connection [ th->rst is checked in tcp_send_reset()] */
- tcp_statistics.TcpAttemptFails++;
- tcp_send_reset(daddr, saddr, th,
- sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
- kfree_skb(skb, FREE_READ);
- return(0);
- }
- if(th->rst)
- return tcp_reset(sk,skb);
- if(!th->syn)
- {
- /* A valid ack from a different connection
- start. Shouldn't happen but cover it */
- tcp_statistics.TcpAttemptFails++;
- tcp_send_reset(daddr, saddr, th,
- sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
- kfree_skb(skb, FREE_READ);
- return 0;
- }
-
- /* process the ACK, get the SYN packet out
- * of the send queue, do other initial
- * processing stuff. [We know it's good, and
- * we know it's the SYN,ACK we want.]
- */
- tcp_ack(sk,th,skb->ack_seq,len);
-
+ tcp_statistics.TcpAttemptFails++;
+ return 1;
+ }
- /*
- * Ok.. it's good. Set up sequence numbers and
- * move to established.
- */
- sk->acked_seq = skb->seq+1;
- sk->lastwin_seq = skb->seq+1;
- sk->fin_seq = skb->seq;
- tcp_send_ack(sk);
- tcp_set_state(sk, TCP_ESTABLISHED);
- tcp_options(sk,th);
- sk->dummy_th.dest=th->source;
- sk->copied_seq = sk->acked_seq;
- if(!sk->dead)
- {
- sk->state_change(sk);
- sock_wake_async(sk->socket, 0);
- }
- if(sk->max_window==0)
- {
- sk->max_window = 32;
- sk->mss = min(sk->max_window, sk->mtu);
- }
- /* Reset the RTT estimator to the initial
- * state rather than testing to avoid
- * updating it on the ACK to the SYN packet.
- */
- sk->rtt = 0;
- sk->rto = TCP_TIMEOUT_INIT;
- sk->mdev = TCP_TIMEOUT_INIT;
+ if(th->rst)
+ {
+ tcp_reset(sk,skb);
+ goto discard;
}
- else
+
+ if(!th->syn)
{
- /* See if SYN's cross. Drop if boring */
- if(th->syn && !th->rst)
- {
- /* Crossed SYN's are fine - but talking to
- yourself is right out... */
- if(sk->saddr==saddr && sk->daddr==daddr &&
- sk->dummy_th.source==th->source &&
- sk->dummy_th.dest==th->dest)
- {
- tcp_statistics.TcpAttemptFails++;
- return tcp_reset(sk,skb);
- }
- tcp_set_state(sk,TCP_SYN_RECV);
-
- /*
- * FIXME:
- * Must send SYN|ACK here
- */
- }
- /* Discard junk segment */
- kfree_skb(skb, FREE_READ);
- return 0;
+ /*
+ * A valid ack from a different connection
+ * start. Shouldn't happen, but cover it anyway.
+ */
+ tcp_statistics.TcpAttemptFails++;
+ return 1;
}
+
/*
- * SYN_RECV with data maybe.. drop through
+ * Ok... it's good. Set up sequence numbers
+ * and move to ESTABLISHED.
*/
- goto rfc_step6;
- }
- /*
- * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
- * a more complex suggestion for fixing these reuse issues in RFC1644
- * but not yet ready for general use. Also see RFC1379.
- *
- * Note the funny way we go back to the top of this function for
- * this case ("goto try_next_socket"). That also takes care of
- * checking "sk->users" for the new socket as well as doing all
- * the normal tests on the packet.
- */
-
-#define BSD_TIME_WAIT
-#ifdef BSD_TIME_WAIT
- if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
- after(skb->seq, sk->acked_seq) && !th->rst)
- {
- u32 seq = sk->write_seq;
- if(sk->debug)
- printk("Doing a BSD time wait\n");
- tcp_statistics.TcpEstabResets++;
- atomic_sub(skb->truesize, &sk->rmem_alloc);
- skb->sk = NULL;
- sk->err=ECONNRESET;
- tcp_set_state(sk, TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr, dev->pa_addr, skb->redirport);
- /* this is not really correct: we should check sk->users */
- if (sk && sk->state==TCP_LISTEN)
- {
- skb->sk = sk;
- atomic_add(skb->truesize, &sk->rmem_alloc);
- tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
- return 0;
+ tp->rcv_nxt = skb->seq+1;
+ tp->rcv_wnd = 0;
+ tp->rcv_wup = skb->seq+1;
+
+ tp->snd_wnd = htons(th->window);
+ tp->snd_wl1 = skb->seq;
+ tp->snd_wl2 = skb->ack_seq;
+
+ sk->fin_seq = skb->seq;
+ tcp_send_ack(sk);
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ rcv_mss = tcp_parse_options(th);
+
+ if (rcv_mss == 0)
+ {
+ rcv_mss = 536;
}
- kfree_skb(skb, FREE_READ);
+
+ sk->mss = min(sk->mss, rcv_mss);
+
+ sk->dummy_th.dest = th->source;
+ sk->copied_seq = tp->rcv_nxt;
+
+ if(!sk->dead)
+ {
+ sk->state_change(sk);
+ sock_wake_async(sk->socket, 0);
+ }
+
+ /* Drop through step 6 */
+ goto step6;
+ }
+ else
+ {
+ if(th->syn && !th->rst)
+ {
+ /*
+ * The previous version of the code
+ * checked for "connecting to self"
+ * here. That check is now done in
+ * tcp_connect().
+ */
+
+ tcp_set_state(sk, TCP_SYN_RECV);
+
+ tp->rcv_nxt = skb->seq + 1;
+ tp->rcv_wup = skb->seq + 1;
+
+ tp->snd_wnd = htons(th->window);
+ tp->snd_wl1 = skb->seq;
+
+ tcp_send_synack(sk);
+ goto discard;
+ }
+
+ }
+ break;
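
The sequence-number bookkeeping in the SYN_SENT code above follows the standard active-open handshake (ordinary TCP, not specific to this patch); schematically, with x and y the two initial sequence numbers:

	SYN_SENT     --- SYN, seq=x              --->
	             <-- SYN,ACK, seq=y, ack=x+1 ---
	ESTABLISHED  --- ACK, ack=y+1            --->

Once the SYN,ACK is processed, rcv_nxt and rcv_wup become y+1; the 536-byte fallback used when the peer sends no MSS option is the RFC 1122 default.
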
+
+ case TCP_TIME_WAIT:
+ /*
+ * RFC 1122:
+ * "When a connection is [...] on TIME-WAIT state [...]
+ * [a TCP] MAY accept a new SYN from the remote TCP to
+ * reopen the connection directly, if it:
+ *
+ * (1) assigns its initial sequence number for the new
+ * connection to be larger than the largest sequence
+ * number it used on the previous connection incarnation,
+ * and
+ *
+ * (2) returns to TIME-WAIT state if the SYN turns out
+ * to be an old duplicate".
+ */
+
+ if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt))
+ {
+ __u32 isn;
+ int err;
+
+ atomic_sub(skb->truesize, &sk->rmem_alloc);
+ skb->sk = NULL;
+ sk->err = ECONNRESET;
+ tcp_set_state(sk, TCP_CLOSE);
+ sk->shutdown = SHUTDOWN_MASK;
+
+ isn = tp->rcv_nxt + 128000;
+
+ sk = tp->af_specific->get_sock(skb, th);
+
+ if (sk == NULL)
+ goto discard;
+
+ skb->sk = sk;
+ tp = &sk->tp_pinfo.af_tcp;
+ atomic_add(skb->truesize, &sk->rmem_alloc);
+
+ err = tp->af_specific->conn_request(sk, skb, opt, isn);
+
+ if (err < 0)
+ return 1;
+
return 0;
}
-#endif
+
+ break;
+
}
/*
- * We are now in normal data flow (see the step list in the RFC)
- * Note most of these are inline now. I'll inline the lot when
- * I have time to test it hard and look at what gcc outputs
+ * step 1: check sequence number
*/
- if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
+ if (!tcp_sequence(tp, skb->seq, skb->end_seq))
{
- bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
- kfree_skb(skb, FREE_READ);
- return 0;
+ if (!th->rst)
+ {
+ tcp_send_ack(sk);
+ goto discard;
+ }
}
+
+ /*
+ * step 2: check RST bit
+ */
+
if(th->rst)
- return tcp_reset(sk,skb);
-
+ {
+ tcp_reset(sk,skb);
+ goto discard;
+ }
+
/*
+ * step 3: check security and precedence
+ * [ignored]
+ */
+
+ /*
+ * step 4:
+ *
* Check for a SYN, and ensure it matches the SYN we were
* first sent. We have to handle the rather unusual (but valid)
* sequence that KA9Q derived products may generate of
@@ -2098,77 +1684,152 @@
* We keep syn_seq as the sequence space occupied by the
* original syn.
*/
-
- if(th->syn && skb->seq!=sk->syn_seq)
+
+ if (th->syn && skb->seq!=sk->syn_seq)
{
- tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
- return tcp_reset(sk,skb);
+ tcp_reset(sk, skb);
+ return 1;
}
/*
- * Process the ACK
+ * step 5: check the ACK field
*/
-
- if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
+ if (th->ack)
{
- /*
- * Our three way handshake failed.
- */
-
- if(sk->state==TCP_SYN_RECV)
- {
- tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
+ int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len);
+
+ switch(sk->state) {
+ case TCP_SYN_RECV:
+ if (acceptable)
+ {
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ sk->dummy_th.dest=th->source;
+ sk->copied_seq = tp->rcv_nxt;
+
+ if(!sk->dead)
+ sk->state_change(sk);
+
+ tp->snd_una = skb->ack_seq;
+ tp->snd_wnd = htons(th->window);
+ tp->snd_wl1 = skb->seq;
+ tp->snd_wl2 = skb->ack_seq;
+
+ }
+ else
+ return 1;
+ break;
+
+ case TCP_FIN_WAIT1:
+
+ if (tp->snd_una == sk->write_seq)
+ {
+ sk->shutdown |= SEND_SHUTDOWN;
+ tcp_set_state(sk, TCP_FIN_WAIT2);
+ if (!sk->dead)
+ sk->state_change(sk);
+ }
+ break;
+
+ case TCP_CLOSING:
+
+ if (tp->snd_una == sk->write_seq)
+ {
+ tcp_time_wait(sk);
+ if (!sk->dead)
+ sk->state_change(sk);
+ }
+ break;
+
+ case TCP_LAST_ACK:
+
+ if (tp->snd_una == sk->write_seq)
+ {
+ sk->shutdown = SHUTDOWN_MASK;
+ tcp_set_state(sk,TCP_CLOSE);
+ if (!sk->dead)
+ sk->state_change(sk);
+ goto discard;
+ }
+ break;
+
+ case TCP_TIME_WAIT:
+ /*
+ * Keep us in TIME_WAIT until we stop getting
+ * packets; reset the timeout on each one.
+ */
+ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+ break;
+
}
- kfree_skb(skb, FREE_READ);
- return 0;
}
-
-rfc_step6: /* I'll clean this up later */
+ else
+ goto discard;
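
To summarize the ACK-driven transitions above, each of which keys on our FIN having been acknowledged (snd_una == write_seq):

	FIN_WAIT1  ->  FIN_WAIT2   our FIN acked; send side shut down
	CLOSING    ->  TIME_WAIT   both FINs sent and acked
	LAST_ACK   ->  CLOSE       connection fully closed

SYN_RECV moves to ESTABLISHED on any acceptable ACK, and TIME_WAIT simply restarts its TIME_CLOSE timer for another TCP_TIMEWAIT_LEN.
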
- /*
- * If the accepted buffer put us over our queue size we
- * now drop it (we must process the ack first to avoid
- * deadlock cases).
- */
+ step6:
/*
- * Process urgent data
+ * step 6: check the URG bit
*/
-
+
tcp_urg(sk, th, len);
-
- /*
- * Process the encapsulated data
- */
-
- if(tcp_data(skb,sk, saddr, len))
- kfree_skb(skb, FREE_READ);
/*
- * If our receive queue has grown past its limits,
- * try to prune away duplicates etc..
+ * step 7: process the segment text
*/
- if (sk->rmem_alloc > sk->rcvbuf)
- prune_queue(&sk->receive_queue);
- /*
- * And done
- */
+ switch (sk->state) {
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ if (!before(skb->seq, sk->fin_seq))
+ break;
- return 0;
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
-no_tcp_socket:
- /*
- * No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
- */
- tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
+ /*
+ * RFC 793 says to queue data in these states;
+ * RFC 1122 says we MUST send a reset.
+ * 4.4BSD also sends a reset.
+ */
+
+ if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead)
+ {
+ if (after(skb->end_seq - th->fin, tp->rcv_nxt))
+ {
+ tcp_reset(sk, skb);
+ return 1;
+ }
+ }
+
+ case TCP_ESTABLISHED:
+ queued = tcp_data(skb, sk, len);
+ break;
+ }
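
The fall-throughs in that switch are deliberate; in outline:

	/*
	 * CLOSE_WAIT, CLOSING: ignore data at or beyond fin_seq
	 *	(break); otherwise fall through.
	 * FIN_WAIT1, FIN_WAIT2: if our receive side is shut down and
	 *	the socket is dead, new data draws a reset (the
	 *	RFC 1122 behaviour); otherwise fall through.
	 * ESTABLISHED: queue the segment text via tcp_data().
	 */
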
-discard_it:
/*
- * Discard frame
+ * step 8: check the FIN bit
*/
- skb->sk = NULL;
+
+ if (th->fin)
+ {
+ tcp_fin(skb, sk, th);
+ }
+
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+
+ if (queued)
+ return 0;
+ discard:
+
kfree_skb(skb, FREE_READ);
return 0;
}
+
+/*
+ * Local variables:
+ * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -c -o tcp_input.o tcp_input.c"
+ * c-file-style: "Linux"
+ * End:
+ */