tcp_input.c
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Implementation of the Transmission Control Protocol(TCP).
*
* Version: @(#)tcp_input.c 1.0.16 05/25/93
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
* Corey Minyard <wf-rch!minyard@relay.EU.net>
* Florian La Roche, <flla@stud.uni-sb.de>
* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
* Linus Torvalds, <torvalds@cs.helsinki.fi>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Matthew Dillon, <dillon@apollo.west.oic.com>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
*
* FIXES
* Pedro Roque : Double ACK bug
* Eric Schenk : Fixes to slow start algorithm.
* Eric Schenk : Yet another double ACK bug.
* Eric Schenk : Delayed ACK bug fixes.
* Eric Schenk : Floyd style fast retrans war avoidance.
* Eric Schenk : Skip fast retransmit on small windows.
* Eric Schenk : Fixes to retransmission code to
* : avoid extra retransmission.
* Theodore Ts'o : Do secure TCP sequence numbers.
*/
#include <linux/config.h>
#include <linux/types.h>
#include <linux/random.h>
#include <net/tcp.h>
/*
* Policy code extracted so it's now separate
*/
/*
* Called each time to estimate the delayed ack timeout. This is
* how it should be done so a fast link isn't impacted by ack delay.
*/
extern __inline__ void tcp_delack_estimator(struct sock *sk)
{
/*
* Delayed ACK time estimator.
*/
if (sk->lrcvtime == 0)
{
sk->lrcvtime = jiffies;
sk->ato = HZ/3;
}
else
{
int m;
m = jiffies - sk->lrcvtime;
sk->lrcvtime = jiffies;
if (m <= 0)
m = 1;
/* This used to test against sk->rtt.
* On a purely receiving link, there is no rtt measure.
* The result is that we lose delayed ACKs on one-way links.
* Therefore we test against sk->rto, which will always
* at least have a default value.
*/
if (m > sk->rto)
{
sk->ato = sk->rto;
/*
* printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
*/
}
else
{
/*
* Very fast acting estimator.
* May fluctuate too much. Probably we should be
* doing something like the rtt estimator here.
*/
sk->ato = (sk->ato >> 1) + m;
/*
* printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
*/
}
}
}
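/*
 * Worked example of the estimator above (illustrative; assumes HZ=100,
 * so jiffies are 10ms ticks): the first segment seeds ato = HZ/3 = 33.
 * If segments then arrive every m = 5 jiffies, ato decays as
 * 33 -> 33/2+5 = 21 -> 21/2+5 = 15 -> 12 -> ..., settling near 2*m = 10.
 * Should the inter-arrival gap ever exceed rto, ato is simply clamped
 * to rto so a slow or one-way link never delays ACKs past the
 * retransmission timeout.
 */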
/*
* Called on frames that were known _not_ to have been
* retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
* The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
*/
extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
{
long m;
/*
* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
* are scaled versions of rtt and mean deviation.
* This is designed to be as fast as possible
* m stands for "measurement".
*/
m = jiffies - oskb->when; /* RTT */
if (sk->rtt != 0) {
if(m<=0)
m=1; /* IS THIS RIGHT FOR <0 ??? */
m -= (sk->rtt >> 3); /* m is now error in rtt est */
sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0)
m = -m; /* m is now abs(error) */
m -= (sk->mdev >> 2); /* similar update on mdev */
sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
} else {
/* no previous measure. */
sk->rtt = m<<3; /* take the measured time to be rtt */
sk->mdev = m<<1; /* make sure rto = 3*rtt */
}
/*
* Now update timeout. Note that this removes any backoff.
*/
/* Jacobson's algorithm calls for rto = R + 4V.
* We diverge from Jacobson's algorithm here. See the commentary
* in tcp_ack to understand why.
*/
sk->rto = (sk->rtt >> 3) + sk->mdev;
sk->rto += (sk->rto>>2) + (sk->rto >> (sk->cong_window-1));
if (sk->rto > 120*HZ)
sk->rto = 120*HZ;
if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
sk->rto = HZ/5;
sk->backoff = 0;
}
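/*
 * Worked example of the scaling above (illustrative; assumes HZ=100 and
 * sk->cong_window == 2): a first sample of m = 10 jiffies (100ms) gives
 * rtt = 80 (srtt scaled by 8) and mdev = 20 (deviation scaled by 4), so
 * rto = (80>>3) + 20 = 30, then rto += (30>>2) + (30>>1) = 7 + 15,
 * i.e. rto = 52 jiffies, roughly half a second, well within the
 * [HZ/5, 120*HZ] clamp.
 */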
/*
* Cached last hit socket
*/
static volatile unsigned long th_cache_saddr, th_cache_daddr;
static volatile unsigned short th_cache_dport, th_cache_sport;
static volatile struct sock *th_cache_sk;
void tcp_cache_zap(void)
{
th_cache_sk=NULL;
}
/*
* Find the socket, using the last hit cache if applicable. The cache is not quite
* right...
*/
static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport, u32 paddr, u16 pport)
{
struct sock * sk;
sk = (struct sock *) th_cache_sk;
if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
sport != th_cache_sport || dport != th_cache_dport) {
sk = get_sock(&tcp_prot, dport, saddr, sport, daddr, paddr, pport);
if (sk) {
th_cache_saddr=saddr;
th_cache_daddr=daddr;
th_cache_dport=dport;
th_cache_sport=sport;
th_cache_sk=sk;
}
}
return sk;
}
/*
* React to an out-of-window TCP sequence number in an incoming packet
*/
static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
struct device *dev)
{
if (th->rst)
return;
/*
* Send a reset if we get something not ours and we are
* unsynchronized. Note: We don't do anything to our end. We
* are just killing the bogus remote connection then we will
* connect again and it will work (with luck).
*/
if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
{
tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev,0,255);
return;
}
/*
* This packet is old news. Usually this is just a resend
* from the far end, but sometimes it means the far end lost
* an ACK we sent, so we better send an ACK.
*/
/*
* BEWARE! Unconditionally answering an out-of-window ack with an ack
* can result in an infinite exchange of empty acks.
* This check cures the bug found by Michiel Boland, but
* not other possible cases.
* If we are in TCP_TIME_WAIT, we have already received the
* FIN, so our peer does not need a window update. If our
* ACK were lost, the peer would retransmit its FIN anyway. --ANK
*/
if (sk->state != TCP_TIME_WAIT || ntohl(th->seq) != end_seq)
tcp_send_ack(sk);
}
/*
* This function checks to see if the tcp header is actually acceptable.
*/
extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
{
u32 end_window = sk->lastwin_seq + sk->window;
return /* if start is at end of window, end must be too (zero window) */
(seq == end_window && seq == end_seq) ||
/* if start is before end of window, check for interest */
(before(seq, end_window) && !before(end_seq, sk->acked_seq));
}
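/*
 * Illustrative example: with acked_seq = 1000, lastwin_seq = 1000 and
 * window = 500, end_window is 1500. A segment covering 1200..1300 is
 * accepted (it starts inside the window and ends at or beyond acked_seq),
 * a stale segment covering 900..950 is rejected, and with a zero window
 * only a zero-length segment sitting exactly at 1000 passes the first
 * clause.
 */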
/*
* When we get a reset we do this. This probably is a tcp_output routine
* really.
*/
static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
sk->zapped = 1;
/*
* We want the right error as BSD sees it (and indeed as we do).
*/
switch (sk->state) {
case TCP_TIME_WAIT:
break;
case TCP_SYN_SENT:
sk->err = ECONNREFUSED;
break;
case TCP_CLOSE_WAIT:
sk->err = EPIPE;
break;
default:
sk->err = ECONNRESET;
}
#ifdef CONFIG_TCP_RFC1337
/*
* Time wait assassination protection [RFC1337]
*
* This is a good idea, but causes more sockets to take time to close.
*
* Ian Heavens has since shown this is an inadequate fix for the protocol
* bug in question.
*/
if(sk->state!=TCP_TIME_WAIT)
{
tcp_set_state(sk,TCP_CLOSE);
sk->shutdown = SHUTDOWN_MASK;
}
#else
tcp_set_state(sk,TCP_CLOSE);
sk->shutdown = SHUTDOWN_MASK;
#endif
if (!sk->dead)
sk->state_change(sk);
kfree_skb(skb, FREE_READ);
return(0);
}
/*
* Look for tcp options. Parses everything but only knows about MSS.
* This routine is always called with the packet containing the SYN.
* However it may also be called with the ack to the SYN. So you
* can't assume this is always the SYN. It's always called after
* we have set up sk->mtu to our own MTU.
*
* We need at minimum to add PAWS support here. Possibly large windows
* as Linux gets deployed on 100Mb/sec networks.
*/
static void tcp_options(struct sock *sk, struct tcphdr *th)
{
unsigned char *ptr;
int length=(th->doff*4)-sizeof(struct tcphdr);
int mss_seen = 0;
ptr = (unsigned char *)(th + 1);
while(length>0)
{
int opcode=*ptr++;
int opsize=*ptr++;
switch(opcode)
{
case TCPOPT_EOL:
return;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
ptr--; /* the opsize=*ptr++ above was a mistake */
continue;
default:
if(opsize<=2) /* Avoid silly options looping forever */
return;
switch(opcode)
{
case TCPOPT_MSS:
if(opsize==4 && th->syn)
{
sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
mss_seen = 1;
}
break;
/* Add other options here as people feel the urge to implement stuff like large windows */
}
ptr+=opsize-2;
length-=opsize;
}
}
if (th->syn)
{
if (! mss_seen)
sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
}
#ifdef CONFIG_INET_PCTCP
sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
sk->mss = min(sk->max_window, sk->mtu);
sk->max_unacked = 2 * sk->mss;
#endif
}
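/*
 * For reference, the MSS option parsed above is laid out on the wire as
 * kind=2, length=4, followed by a 16-bit value in network byte order;
 * e.g. the bytes 02 04 05 b4 advertise an MSS of 1460. It is only
 * honoured on SYN segments and can only lower sk->mtu, never raise it.
 */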
/*
* This routine handles a connection request.
* It should make sure we haven't already responded.
* Because of the way BSD works, we have to send a syn/ack now.
* This also means it will be harder to close a socket which is
* listening.
*/
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
{
struct sock *newsk;
struct tcphdr *th;
struct rtable *rt;
th = skb->h.th;
/* If the socket is dead, don't accept the connection. */
if (!sk->dead)
{
sk->data_ready(sk,0);
}
else
{
if(sk->debug)
printk("Reset on %p: Connect on dead socket.\n",sk);
tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, 0,255);
tcp_statistics.TcpAttemptFails++;
kfree_skb(skb, FREE_READ);
return;
}
/*
* Make sure we can accept more. This will prevent a
* flurry of syns from eating up all our memory.
*
* BSD does some funnies here and allows 3/2 times the
* set backlog as a fudge factor. That's just too gross.
*/
if (sk->ack_backlog >= sk->max_ack_backlog)
{
tcp_statistics.TcpAttemptFails++;
kfree_skb(skb, FREE_READ);
return;
}
/*
* We need to build a new sock struct.
* It is sort of bad to have a socket without an inode attached
* to it, but the wake_up's will just wake up the listening socket,
* and if the listening socket is destroyed before this is taken
* off of the queue, this will take care of it.
*/
newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
if (newsk == NULL)
{
/* just ignore the syn. It will get retransmitted. */
tcp_statistics.TcpAttemptFails++;
kfree_skb(skb, FREE_READ);
return;
}
memcpy(newsk, sk, sizeof(*newsk));
newsk->opt = NULL;
newsk->ip_route_cache = NULL;
if (opt && opt->optlen)
{
sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
if (!sk->opt)
{
kfree_s(newsk, sizeof(struct sock));
tcp_statistics.TcpAttemptFails++;
kfree_skb(skb, FREE_READ);
return;
}
if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
{
kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
kfree_s(newsk, sizeof(struct sock));
tcp_statistics.TcpAttemptFails++;
kfree_skb(skb, FREE_READ);
return;
}
}
skb->when = jiffies; /* For timeout */
skb_queue_head_init(&newsk->write_queue);
skb_queue_head_init(&newsk->receive_queue);
newsk->send_head = NULL;
newsk->send_tail = NULL;
newsk->send_next = NULL;
skb_queue_head_init(&newsk->back_log);
newsk->rtt = 0;
newsk->rto = TCP_TIMEOUT_INIT;
newsk->mdev = TCP_TIMEOUT_INIT;
newsk->max_window = 0;
/*
* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
newsk->cong_window = 1;
newsk->cong_count = 0;
newsk->ssthresh = 0x7fffffff;
newsk->lrcvtime = 0;
newsk->idletime = 0;
newsk->high_seq = 0;
newsk->backoff = 0;
newsk->blog = 0;
newsk->intr = 0;
newsk->proc = 0;
newsk->done = 0;
newsk->partial = NULL;
newsk->pair = NULL;
newsk->wmem_alloc = 0;
newsk->rmem_alloc = 0;
newsk->localroute = sk->localroute;
newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
newsk->err = 0;
newsk->shutdown = 0;
newsk->ack_backlog = 0;
newsk->acked_seq = skb->seq+1;
newsk->lastwin_seq = skb->seq+1;
newsk->delay_acks = 1;
newsk->copied_seq = skb->seq+1;
newsk->fin_seq = skb->seq;
newsk->syn_seq = skb->seq;
newsk->state = TCP_SYN_RECV;
newsk->timeout = 0;
newsk->ip_xmit_timeout = 0;
newsk->write_seq = seq;
newsk->window_seq = newsk->write_seq;
newsk->rcv_ack_seq = newsk->write_seq;
newsk->urg_data = 0;
newsk->retransmits = 0;
newsk->linger=0;
newsk->destroy = 0;
init_timer(&newsk->timer);
newsk->timer.data = (unsigned long)newsk;
newsk->timer.function = &net_timer;
init_timer(&newsk->delack_timer);
newsk->delack_timer.data = (unsigned long)newsk;
newsk->delack_timer.function = tcp_delack_timer;
init_timer(&newsk->retransmit_timer);
newsk->retransmit_timer.data = (unsigned long)newsk;
newsk->retransmit_timer.function = tcp_retransmit_timer;
newsk->dummy_th.source = skb->h.th->dest;
newsk->dummy_th.dest = skb->h.th->source;
newsk->users=0;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
/*
* Deal with possibly redirected traffic by setting num to
* the intended destination port of the received packet.
*/
newsk->num = ntohs(skb->h.th->dest);
#endif
/*
* Swap these two, they are from our point of view.
*/
newsk->daddr = saddr;
newsk->saddr = daddr;
newsk->rcv_saddr = daddr;
put_sock(newsk->num,newsk);
newsk->acked_seq = skb->seq + 1;
newsk->copied_seq = skb->seq + 1;
newsk->socket = NULL;
/*
* Grab the ttl and tos values and use them
*/
newsk->ip_ttl=sk->ip_ttl;
newsk->ip_tos=skb->ip_hdr->tos;
/*
* Use 512 or whatever user asked for
*/
/*
* Note use of sk->user_mss, since user has no direct access to newsk
*/
rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
newsk->ip_route_cache = rt;
if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
newsk->window_clamp = rt->rt_window;
else
newsk->window_clamp = 0;
if (sk->user_mss)
newsk->mtu = sk->user_mss;
else if (rt)
newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
else
newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
/*
* But not bigger than device MTU
*/
newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
#ifdef CONFIG_SKIP
/*
* SKIP devices set their MTU to 65535. This is so they can take packets
* unfragmented to the security process and then fragment them. They could
* lie to the TCP layer about a suitable MTU, but it's easier to let skip
* sort it out, simply because the final packet we want unfragmented is going to be
*
* [IPHDR][IPSP][Security data][Modified TCP data][Security data]
*/
if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif
/*
* This will min with what arrived in the packet
*/
tcp_options(newsk,skb->h.th);
tcp_cache_zap();
tcp_send_synack(newsk, sk, skb);
}
/*
* Handle a TCP window that shrunk on us. It shouldn't happen,
* but..
*
* We may need to move packets from the send queue
* to the write queue, if the window has been shrunk on us.
* The RFC says you are not allowed to shrink your window
* like this, but if the other end does, you must be able
* to deal with it.
*/
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
struct sk_buff *skb;
struct sk_buff *skb2;
struct sk_buff *wskb = NULL;
skb2 = sk->send_head;
sk->send_head = NULL;
sk->send_tail = NULL;
sk->send_next = NULL;
/*
* This is an artifact of a flawed concept. We want one
* queue and a smarter send routine when we send all.
*/
cli();
while (skb2 != NULL)
{
skb = skb2;
skb2 = skb->link3;
skb->link3 = NULL;
if (after(skb->end_seq, window_seq))
{
if (sk->packets_out > 0)
sk->packets_out--;
/* We may need to remove this from the dev send list. */
if (skb->next != NULL)
{
skb_unlink(skb);
}
/* Now add it to the write_queue. */
if (wskb == NULL)
skb_queue_head(&sk->write_queue,skb);
else
skb_append(wskb,skb);
wskb = skb;
}
else
{
if (sk->send_head == NULL)
{
sk->send_head = skb;
sk->send_tail = skb;
sk->send_next = skb;
}
else
{
sk->send_tail->link3 = skb;
sk->send_tail = skb;
}
skb->link3 = NULL;
}
}
sti();
}
/*
* This routine deals with incoming acks, but not outgoing ones.
*
* This routine is totally _WRONG_. The list structuring is wrong,
* the algorithm is wrong, the code is wrong.
*/
static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
int flag = 0;
u32 window_seq;
/*
* 1 - there was data in packet as well as ack or new data is sent or
* in shutdown state
* 2 - data from retransmit queue was acked and removed
* 4 - window shrunk or data from retransmit queue was acked and removed
*/
if(sk->zapped)
return(1); /* Dead, can't ack any more so why bother */
/*
* We have dropped back to keepalive timeouts. Thus we have
* no retransmits pending.
*/
if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
sk->retransmits = 0;
/*
* If the ack is newer than sent or older than previous acks
* then we can probably ignore it.
*/
if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
goto uninteresting_ack;
/*
* Have we discovered a larger window
*/
window_seq = ntohs(th->window);
if (window_seq > sk->max_window)
{
sk->max_window = window_seq;
#ifdef CONFIG_INET_PCTCP
/* Hack because we don't send partial packets to hosts that don't
   handle SWS */
sk->mss = min(window_seq>>1, sk->mtu);
#else
sk->mss = min(window_seq, sk->mtu);
#endif
}
window_seq += ack;
/*
* See if our window has been shrunk.
*/
if (after(sk->window_seq, window_seq))
tcp_window_shrunk(sk, window_seq);
/*
* Pipe has emptied
*/
if (sk->send_tail == NULL || sk->send_head == NULL)
{
sk->send_head = NULL;
sk->send_tail = NULL;
sk->send_next = NULL;
sk->packets_out= 0;
}
/*
* We don't want too many packets out there.
*/
if (sk->ip_xmit_timeout == TIME_WRITE &&
sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
{
/*
* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328. Because we keep cong_window in integral
* mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
* counter and increment it once every cwnd times. It's possible
* that this should be done only if sk->retransmits == 0. I'm
* interpreting "new data is acked" as including data that has
* been retransmitted but is just now being acked.
*/
if (sk->cong_window <= sk->ssthresh)
/*
* In "safe" area, increase
*/
sk->cong_window++;
else
{
/*
* In dangerous area, increase slowly. In theory this is
* sk->cong_window += 1 / sk->cong_window
*/
if (sk->cong_count >= sk->cong_window)
{
sk->cong_window++;
sk->cong_count = 0;
}
else
sk->cong_count++;
}
}
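/*
 * Illustrative growth pattern for the code above: with ssthresh = 8 and
 * cong_window starting at 1, each newly acked segment at or below
 * ssthresh adds a whole mss, so the window roughly doubles per round trip
 * (1, 2, 4, 8). Once past ssthresh, cong_count must wrap a full window
 * of acks before the next increment, giving the familiar one-mss-per-RTT
 * linear phase of congestion avoidance.
 */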
/*
* Remember the highest ack received and update the
* right hand window edge of the host.
* We do a bit of work here to track number of times we've
* seen this ack without a change in the right edge of the
* window and no data in the packet.
* This will allow us to do fast retransmits.
*/
/* We are looking for duplicate ACKs here.
* An ACK is a duplicate if:
* (1) it has the same sequence number as the largest number we've seen,
* (2) it has the same window as the last ACK,
* (3) we have outstanding data that has not been ACKed
* (4) The packet was not carrying any data.
* (5) [From Floyd's paper on fast retransmit wars]
* The packet acked data after high_seq;
* I've tried to order these from most likely to fail
* to least likely to fail.
* [These are an extension of the rules BSD stacks use to
* determine if an ACK is a duplicate.]
*/
if (sk->rcv_ack_seq == ack
&& sk->window_seq == window_seq
&& len != th->doff*4
&& before(ack, sk->sent_seq)
&& after(ack, sk->high_seq))
{
/* Prevent counting of duplicate ACKs if the congestion
* window is smaller than 3. Note that since we reduce
* the congestion window when we do a fast retransmit,
* we must be careful to keep counting if we were already
* counting. The idea behind this is to avoid doing
* fast retransmits if the congestion window is so small
* that we cannot get 3 ACKs due to the loss of a packet
* unless we are getting ACKs for retransmitted packets.
*/
if (sk->cong_window >= 3 || sk->rcv_ack_cnt > MAX_DUP_ACKS+1)
sk->rcv_ack_cnt++;
/* See draft-stevens-tcpca-spec-01 for explanation
* of what we are doing here.
*/
if (sk->rcv_ack_cnt == MAX_DUP_ACKS+1) {
int tmp;
/* We need to be a bit careful to preserve the
* count of packets that are out in the system here.
*/
sk->ssthresh = max(sk->cong_window >> 1, 2);
sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1;
tmp = sk->packets_out;
tcp_do_retransmit(sk,0);
sk->packets_out = tmp;
} else if (sk->rcv_ack_cnt > MAX_DUP_ACKS+1) {
sk->cong_window++;
/*
* At this point we are supposed to transmit a NEW
* packet (not retransmit the missing packet,
* this would only get us into a retransmit war.)
* I think that having just adjusted cong_window
* we will transmit the new packet below.
*/
}
}
else
{
if (sk->rcv_ack_cnt > MAX_DUP_ACKS) {
sk->cong_window = sk->ssthresh;
}
sk->window_seq = window_seq;
sk->rcv_ack_seq = ack;
sk->rcv_ack_cnt = 1;
}
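/*
 * Illustrative trace of the duplicate ACK logic above, assuming the
 * MAX_DUP_ACKS constant defined elsewhere in this stack is 3: the first
 * ACK for a given sequence sets rcv_ack_cnt = 1; three data-less repeats
 * with an unchanged window push it to 4 (MAX_DUP_ACKS+1), at which point
 * ssthresh drops to half the congestion window (but at least 2),
 * cong_window becomes ssthresh+4 and the presumed-lost segment is
 * retransmitted. Every further duplicate inflates cong_window by one so
 * new data can keep the pipe full until a fresh ACK deflates it back to
 * ssthresh.
 */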
/*
* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
sk->err_soft = 0;
/*
* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
if (sk->ip_xmit_timeout == TIME_PROBE0)
{
sk->retransmits = 0; /* Our probe was answered */
/*
* Was it a usable window open ?
*/
if (!skb_queue_empty(&sk->write_queue) && /* should always be true */
! before (sk->window_seq, sk->write_queue.next->end_seq))
{
sk->backoff = 0;
/*
* Recompute rto from rtt. this eliminates any backoff.
*/
/*
* Appendix C of Van Jacobson's final version of
* the SIGCOMM 88 paper states that although
* the original paper suggested that
* RTO = R + 2V
* was the correct calculation, experience showed
* better results using
* RTO = R + 4V
* In particular this gives better performance over
* slow links, and should not affect fast links.
*
* Note: Jacobson's algorithm is fine on BSD which
* has a 1/2 second granularity clock, but with our
* 1/100 second granularity clock we become too
* sensitive to minor changes in the round trip time.
* We add in two compensating factors.
* First we multiply by 5/4. For large congestion
* windows this allows us to tolerate burst traffic
* delaying up to 1/4 of our packets.
* We also add in a rtt / cong_window term.
* For small congestion windows this allows
* a single packet delay, but has negligible effect
* on the compensation for large windows.
*/
sk->rto = (sk->rtt >> 3) + sk->mdev;
sk->rto += (sk->rto>>2) + (sk->rto >> (sk->cong_window-1));
if (sk->rto > 120*HZ)
sk->rto = 120*HZ;
if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about
.2 of a second because of BSD delayed acks - on a 100Mb/sec link
.2 of a second is going to need huge windows (SIGH) */
sk->rto = HZ/5;
}
}
/*
* See if we can take anything off of the retransmit queue.
*/
for (;;) {
struct sk_buff * skb = sk->send_head;
if (!skb)
break;
/* Check for a bug. */
if (skb->link3 && after(skb->end_seq, skb->link3->end_seq))
printk("INET: tcp.c: *** bug send_list out of order.\n");
/*
* If our packet is before the ack sequence we can
* discard it as it's confirmed to have arrived at the other end.
*/
if (after(skb->end_seq, ack))
break;
if (sk->retransmits)
{
/*
* We were retransmitting. don't count this in RTT est
*/
flag |= 2;
}
if ((sk->send_head = skb->link3) == NULL)
{
sk->send_tail = NULL;
sk->send_next = NULL;
sk->retransmits = 0;
}
/*
* advance the send_next pointer if needed.
*/
if (sk->send_next == skb)
sk->send_next = sk->send_head;
/*
* Note that we only reset backoff and rto in the
* rtt recomputation code. And that doesn't happen
* if there were retransmissions in effect. So the
* first new packet after the retransmissions is
* sent with the backoff still in effect. Not until
* we get an ack from a non-retransmitted packet do
* we reset the backoff and rto. This allows us to deal
* with a situation where the network delay has increased
* suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
/*
* We have one less packet out there.
*/
if (sk->packets_out > 0)
sk->packets_out --;
/* This is really only supposed to be called when we
* are actually ACKing new data, which should exclude
* the ACK handshake on an initial SYN packet as well.
* Rather than introducing a new test here for this
* special case, we just reset the initial values for
* rtt immediately after we move to the established state.
*/
if (!(flag&2)) /* Not retransmitting */
tcp_rtt_estimator(sk,skb);
IS_SKB(skb);
/*
* We may need to remove this from the dev send list.
*/
cli();
if (skb->next)
skb_unlink(skb);
sti();
kfree_skb(skb, FREE_WRITE); /* write. */
if (!sk->dead)
sk->write_space(sk);
}
/*
* Maybe we can take some stuff off of the write queue,
* and put it onto the xmit queue.
* There is a bizarre case being tested here, to check if
* the data at the head of the queue ends before the start of
* the sequence we already ACKed. This is not an error,
* it can occur when we send a packet directly off of the write_queue
* in a zero window probe.
*/
if (!skb_queue_empty(&sk->write_queue) &&
!before(sk->window_seq, sk->write_queue.next->end_seq) &&
(sk->retransmits == 0 ||
sk->ip_xmit_timeout != TIME_WRITE ||
!after(sk->write_queue.next->end_seq, sk->rcv_ack_seq)) &&
sk->packets_out < sk->cong_window)
{
/*
* Add more data to the send queue.
*/
tcp_write_xmit(sk);
}
/*
* Reset timers to reflect the new state.
*
* from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
* from TCP_CLOSE we don't do anything
*
* from anything else, if there is queued data (or fin) pending,
* we use a TIME_WRITE timeout, if there is data to write but
* no room in the window we use TIME_PROBE0, else if keepalive
* we reset to a KEEPALIVE timeout, else we delete the timer.
*
* We do not set flag for nominal write data, otherwise we may
* force a state where we start to write itsy bitsy tidbits
* of data.
*/
switch(sk->state) {
case TCP_TIME_WAIT:
/*
* keep us in TIME_WAIT until we stop getting packets,
* reset the timeout.
*/
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
break;
case TCP_CLOSE:
/*
* don't touch the timer.
*/
break;
default:
/*
* Must check send_head and write_queue
* to determine which timeout to use.
*/
if (sk->send_head) {
tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
} else if (!skb_queue_empty(&sk->write_queue)
&& sk->ack_backlog == 0)
{
/*
* if the write queue is not empty when we get here
* then we failed to move any data to the retransmit
* queue above. (If we had, send_head would be non-NULL.)
* Furthermore, since the send_head is NULL here
* we must not be in retransmit mode at this point.
* This implies we have no packets in flight,
* hence sk->packets_out < sk->cong_window.
* Examining the conditions for the test to move
* data to the retransmission queue we find that
* we must therefore have a zero window.
* Hence, if the ack_backlog is 0 we should initiate
* a zero probe.
* We don't do a zero probe if we have a delayed
* ACK in hand since the other side may have a
* window opening, but they are waiting to hear
* from us before they tell us about it.
* (They are applying Nagle's rule).
* So, we don't set up the zero window probe
* just yet. We do have to clear the timer
* though in this case...
*/
tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
} else if (sk->keepopen) {
tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
} else {
del_timer(&sk->retransmit_timer);
sk->ip_xmit_timeout = 0;
}
break;
}
/*
* We have nothing queued but space to send. Send any partial
* packets immediately (end of Nagle rule application).
*/
if (sk->packets_out == 0
&& sk->partial != NULL
&& skb_queue_empty(&sk->write_queue)
&& sk->send_head == NULL)
{
tcp_send_partial(sk);
}
/*
* In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
* we are now waiting for an acknowledge to our FIN. The other end is
* already in TIME_WAIT.
*
* Move to TCP_CLOSE on success.
*/
if (sk->state == TCP_LAST_ACK)
{
if (!sk->dead)
sk->state_change(sk);
if(sk->debug)
printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
{
sk->shutdown = SHUTDOWN_MASK;
tcp_set_state(sk,TCP_CLOSE);
return 1;
}
}
/*
* Incoming ACK to a FIN we sent in the case of our initiating the close.
*
* Move to FIN_WAIT2 to await a FIN from the other end. Set
* SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
*/
if (sk->state == TCP_FIN_WAIT1)
{
if (!sk->dead)
sk->state_change(sk);
if (sk->rcv_ack_seq == sk->write_seq)
{
sk->shutdown |= SEND_SHUTDOWN;
tcp_set_state(sk, TCP_FIN_WAIT2);
/* If the socket is dead, then there is no
* user process hanging around using it.
* We want to set up a FIN_WAIT2 timeout ala BSD.
*/
if (sk->dead)
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
}
}
/*
* Incoming ACK to a FIN we sent in the case of a simultaneous close.
*
* Move to TIME_WAIT
*/
if (sk->state == TCP_CLOSING)
{
if (!sk->dead)
sk->state_change(sk);
if (sk->rcv_ack_seq == sk->write_seq)
{
tcp_time_wait(sk);
}
}
/*
* Final ack of a three way shake
*/
if (sk->state==TCP_SYN_RECV)
{
tcp_set_state(sk, TCP_ESTABLISHED);
tcp_options(sk,th);
sk->dummy_th.dest=th->source;
sk->copied_seq = sk->acked_seq;
if(!sk->dead)
sk->state_change(sk);
if(sk->max_window==0)
{
sk->max_window=32; /* Sanity check */
sk->mss=min(sk->max_window,sk->mtu);
}
/* Reset the RTT estimator to the initial
* state rather than testing to avoid
* updating it on the ACK to the SYN packet.
*/
sk->rtt = 0;
sk->rto = TCP_TIMEOUT_INIT;
sk->mdev = TCP_TIMEOUT_INIT;
}
/*
* The following code has been greatly simplified from the
* old hacked up stuff. The wonders of properly setting the
* retransmission timeouts.
*
* If we are retransmitting, and we acked a packet on the retransmit
* queue, and there is still something in the retransmit queue,
* then we can output some retransmission packets.
*/
if (sk->send_head != NULL && (flag&2) && sk->retransmits)
{
tcp_do_retransmit(sk, 1);
}
return 1;
uninteresting_ack:
if(sk->debug)
printk("Ack ignored %u %u\n",ack,sk->sent_seq);
/*
* Keepalive processing.
*/
if (after(ack, sk->sent_seq))
{
return 0;
}
/*
* Restart the keepalive timer.
*/
if (sk->keepopen)
{
if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
}
return 1;
}
/*
* Process the FIN bit. This now behaves as it is supposed to:
* the FIN takes effect when it is validly part of sequence
* space, not earlier when there are holes.
*
* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
* (and thence onto LAST-ACK and finally, CLOSE, we never enter
* TIME-WAIT)
*
* If we are in FINWAIT-1, a received FIN indicates simultaneous
* close and we go into CLOSING (and later onto TIME-WAIT)
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*
*/
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
sk->fin_seq = skb->end_seq;
if (!sk->dead)
{
sk->state_change(sk);
sock_wake_async(sk->socket, 1);
}
switch(sk->state)
{
case TCP_SYN_RECV:
case TCP_SYN_SENT:
case TCP_ESTABLISHED:
/*
* move to CLOSE_WAIT, tcp_data() already handled
* sending the ack.
*/
tcp_set_state(sk,TCP_CLOSE_WAIT);
if (th->rst)
sk->shutdown = SHUTDOWN_MASK;
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
/*
* received a retransmission of the FIN, do
* nothing.
*/
break;
case TCP_TIME_WAIT:
/*
* received a retransmission of the FIN,
* restart the TIME_WAIT timer.
*/
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
return(0);
case TCP_FIN_WAIT1:
/*
* This case occurs when a simultaneous close
* happens, we must ack the received FIN and
* enter the CLOSING state.
*
* This causes a WRITE timeout, which will either
* move on to TIME_WAIT when we timeout, or resend
* the FIN properly (maybe we get rid of that annoying
* FIN lost hang). The TIME_WRITE code is already correct
* for handling this timeout.
*/
if (sk->ip_xmit_timeout != TIME_WRITE) {
if (sk->send_head)
tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
else if (sk->ip_xmit_timeout != TIME_PROBE0
|| skb_queue_empty(&sk->write_queue)) {
/* BUG check case.
* We have a problem here if there
* is no timer running [leads to
* frozen socket] or no data in the
* write queue [means we sent a fin
* and lost it from the queue before
* changing the ack properly].
*/
printk(KERN_ERR "Lost timer or fin packet in tcp_fin.\n");
}
}
tcp_set_state(sk,TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
/*
* received a FIN -- send ACK and enter TIME_WAIT
*/
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
sk->shutdown|=SHUTDOWN_MASK;
tcp_set_state(sk,TCP_TIME_WAIT);
break;
case TCP_CLOSE:
/*
* already in CLOSE
*/
break;
default:
tcp_set_state(sk,TCP_LAST_ACK);
/* Start the timers. */
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
return(0);
}
return(0);
}
/*
* Add a sk_buff to the TCP receive queue, calculating
* the ACK sequence as we go..
*/
static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
{
struct sk_buff * prev, * next;
u32 seq;
/*
* Find where the new skb goes.. (This goes backwards,
* on the assumption that we get the packets in order)
*/
seq = skb->seq;
prev = list->prev;
next = (struct sk_buff *) list;
for (;;) {
if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
break;
next = prev;
prev = prev->prev;
}
__skb_insert(skb, prev, next, list);
}
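/*
 * Illustrative example: if the receive queue already holds segments with
 * sequence numbers 100, 200 and 400, a late arrival with seq 300 is
 * walked backwards past 400 and linked in between 200 and 400, while the
 * common in-order case (say seq 500) stops after a single comparison
 * against the tail.
 */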
/*
* Called for each packet when we find a new ACK endpoint sequence in it
*/
static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
{
/*
* When we ack the fin, we do the FIN
* processing.
*/
skb->acked = 1;
if (skb->h.th->fin)
tcp_fin(skb,sk,skb->h.th);
return skb->end_seq;
}
static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
{
u32 ack_seq;
tcp_insert_skb(skb, &sk->receive_queue);
/*
* Did we get anything new to ack?
*/
ack_seq = sk->acked_seq;
if (!after(skb->seq, ack_seq)) {
if (after(skb->end_seq, ack_seq)) {
/* the packet straddles our window end */
struct sk_buff_head * list = &sk->receive_queue;
struct sk_buff * next;
ack_seq = tcp_queue_ack(skb, sk);
/*
* Do we have any old packets to ack that the above
* made visible? (Go forward from skb)
*/
next = skb->next;
while (next != (struct sk_buff *) list) {
if (after(next->seq, ack_seq))
break;
if (after(next->end_seq, ack_seq))
ack_seq = tcp_queue_ack(next, sk);
next = next->next;
}
/*
* Ok, we found new data, update acked_seq as
* necessary (and possibly send the actual
* ACK packet).
*/
sk->acked_seq = ack_seq;
} else {
if (sk->debug)
printk("Ack duplicate packet.\n");
tcp_send_ack(sk);
return;
}
/*
* Delay the ack if possible. Send ack's to
* fin frames immediately as there shouldn't be
* anything more to come.
*/
if (!sk->delay_acks || th->fin) {
tcp_send_ack(sk);
} else {
/*
* If psh is set we assume it's an
* interactive session that wants quick
* acks to avoid nagling too much.
*/
int delay = HZ/2;
if (th->psh)
delay = HZ/50;
tcp_send_delayed_ack(sk, delay, sk->ato);
}
/*
* Tell the user we have some more data.
*/
if (!sk->dead)
sk->data_ready(sk,0);
}
else
{
/*
* If we've missed a packet, send an ack.
* Also start a timer to send another.
*
* 4.3reno machines look for these kinds of acks so
* they can do fast recovery. Three identical 'old'
* acks let it know that one frame has been lost
* and should be resent. Because this is before the
* whole window of data has timed out it can take
* one lost frame per window without stalling.
* [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
*
* We also should be spotting triple bad sequences.
* [We now do this.]
*
*/
if (!skb->acked)
{
if(sk->debug)
printk("Ack past end of seq packet.\n");
tcp_send_ack(sk);
/*
* We need to be very careful here. We must
* not violate Jacobson's packet conservation condition.
* This means we should only send an ACK when a packet
* leaves the network. We can say a packet left the
* network when we see a packet leave the network, or
* when an rto measure expires.
*/
tcp_send_delayed_ack(sk,sk->rto,sk->rto);
}
}
}
/*
* This routine handles the data. If there is room in the buffer,
* it will have already been moved into it. If there is no
* room, then we will just have to discard the packet.
*/
static int tcp_data(struct sk_buff *skb, struct sock *sk,
unsigned long saddr, unsigned int len)
{
struct tcphdr *th;
u32 new_seq, shut_seq;
th = skb->h.th;
skb_pull(skb,th->doff*4);
skb_trim(skb,len-(th->doff*4));
/*
* The number of bytes in the receive read/assembly queue has increased. Needed for the
* low memory discard algorithm
*/
sk->bytes_rcv += skb->len;
if (skb->len == 0 && !th->fin)
{
/*
* Don't want to keep passing ack's back and forth.
* (someone sent us dataless, boring frame)
*/
if (!th->ack)
tcp_send_ack(sk);
kfree_skb(skb, FREE_READ);
return(0);
}
/*
* We no longer have anyone receiving data on this connection.
*/
#ifndef TCP_DONT_RST_SHUTDOWN
if(sk->shutdown & RCV_SHUTDOWN)
{
/*
* FIXME: BSD has some magic to avoid sending resets to
* broken 4.2 BSD keepalives. Much to my surprise a few non
* BSD stacks still have broken keepalives so we want to
* cope with it.
*/
if(skb->len) /* We don't care if it's just an ack or
a keepalive/window probe */
{
new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
/* Do this the way 4.4BSD treats it. Not what I'd
regard as the meaning of the spec but it's what BSD
does and clearly they know everything 8) */
/*
* This is valid because of two things
*
* a) The way tcp_data behaves at the bottom.
* b) A fin takes effect when read not when received.
*/
shut_seq = sk->acked_seq+1; /* Last byte */
if(after(new_seq,shut_seq))
{
if(sk->debug)
printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
sk, new_seq, shut_seq, sk->blog);
if(sk->dead)
{
sk->acked_seq = new_seq + th->fin;
tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
sk->prot, NULL, skb->dev, 0, 255);
tcp_statistics.TcpEstabResets++;
sk->err = EPIPE;
sk->error_report(sk);
sk->shutdown = SHUTDOWN_MASK;
tcp_set_state(sk,TCP_CLOSE);
kfree_skb(skb, FREE_READ);
return 0;
}
}
}
}
#endif
/*
* We should only call this if there is data in the frame.
*/
tcp_delack_estimator(sk);
tcp_queue(skb, sk, th);
return(0);
}
/*
* This routine is only called when we have urgent data
* signalled. It's the 'slow' part of tcp_urg. It could be
* moved inline now as tcp_urg is only called from one
* place. We handle URGent data wrong. We have to - as
* BSD still doesn't use the correction from RFC961.
*
* For 1003.1g we should support a new option TCP_STDURG to permit
* either form.
*/
static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
u32 ptr = ntohs(th->urg_ptr);
if (ptr)
ptr--;
ptr += ntohl(th->seq);
/* ignore urgent data that we've already seen and read */
if (after(sk->copied_seq, ptr))
return;
/* do we already have a newer (or duplicate) urgent pointer? */
if (sk->urg_data && !after(ptr, sk->urg_seq))
return;
/* tell the world about our new urgent pointer */
if (sk->proc != 0) {
if (sk->proc > 0) {
kill_proc(sk->proc, SIGURG, 1);
} else {
kill_pg(-sk->proc, SIGURG, 1);
}
}
/*
* We may be adding urgent data when the last byte read was
* urgent. To do this requires some care. We cannot just ignore
* sk->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
* or we break the semantics of SIOCATMARK (and thus sockatmark())
*/
if (sk->urg_seq == sk->copied_seq)
sk->copied_seq++; /* Move the copied sequence on correctly */
sk->urg_data = URG_NOTYET;
sk->urg_seq = ptr;
}
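/*
 * Example of the (BSD-style) interpretation used above: th->urg_ptr is
 * taken to point one byte past the urgent data, so with seq = 1000 and
 * urg_ptr = 5 the urgent byte itself sits at sequence 1004. Under the
 * RFC 961/1122 reading the pointer would name the urgent byte directly;
 * the TCP_STDURG option mentioned earlier would be the place to offer
 * that behaviour.
 */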
/*
* This is the 'fast' part of urgent handling.
*/
static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
/*
* Check if we get a new urgent pointer - normally not
*/
if (th->urg)
tcp_check_urg(sk,th);
/*
* Do we wait for any urgent data? - normally not
*/
if (sk->urg_data == URG_NOTYET) {
u32 ptr;
/*
* Is the urgent pointer pointing into this packet?
*/
ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
if (ptr < len) {
sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
if (!sk->dead)
sk->data_ready(sk,0);
}
}
}
/*
* This should be a bit smarter and remove partially
* overlapping stuff too, but this should be good
* enough for any even remotely normal case (and the
* worst that can happen is that we have a few
* unnecessary packets in the receive queue).
*
* This function is never called with an empty list..
*/
static inline void tcp_remove_dups(struct sk_buff_head * list)
{
struct sk_buff * next = list->next;
for (;;) {
struct sk_buff * skb = next;
next = next->next;
if (next == (struct sk_buff *) list)
break;
if (before(next->end_seq, skb->end_seq)) {
__skb_unlink(next, list);
kfree_skb(next, FREE_READ);
next = skb;
continue;
}
if (next->seq != skb->seq)
continue;
__skb_unlink(skb, list);
kfree_skb(skb, FREE_READ);
}
}
/*
* Throw out all unnecessary packets: we've gone over the
* receive queue limit. This shouldn't happen in a normal
* TCP connection, but we might have gotten duplicates etc.
*/
static void prune_queue(struct sk_buff_head * list)
{
for (;;) {
struct sk_buff * skb = list->prev;
/* gone through it all? */
if (skb == (struct sk_buff *) list)
break;
if (!skb->acked) {
__skb_unlink(skb, list);
kfree_skb(skb, FREE_READ);
continue;
}
tcp_remove_dups(list);
break;
}
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
/*
* Check whether a received TCP packet might be for one of our
* connections.
*/
int tcp_chkaddr(struct sk_buff *skb)
{
struct iphdr *iph = skb->h.iph;
struct tcphdr *th = (struct tcphdr *)(skb->h.raw + iph->ihl*4);
struct sock *sk;
sk = get_sock(&tcp_prot, th->dest, iph->saddr, th->source, iph->daddr, 0, 0);
if (!sk) return 0;
/* 0 means accept all LOCAL addresses here, not all the world... */
if (sk->rcv_saddr == 0) return 0;
return 1;
}
#endif
/*
* A TCP packet has arrived.
* skb->h.raw is the TCP header.
*/
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
__u32 daddr, unsigned short len,
__u32 saddr, int redo, struct inet_protocol * protocol)
{
struct tcphdr *th;
struct sock *sk;
__u32 seq;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
int r;
#endif
/*
* "redo" is 1 if we have already seen this skb but couldn't
* use it at that time (the socket was locked). In that case
* we have already done a lot of the work (looked up the socket
* etc).
*/
th = skb->h.th;
sk = skb->sk;
if (!redo) {
tcp_statistics.TcpInSegs++;
if (skb->pkt_type!=PACKET_HOST)
goto discard_it;
/*
* Pull up the IP header.
*/
skb_pull(skb, skb->h.raw-skb->data);
/*
* Try to use the device checksum if provided.
*/
switch (skb->ip_summed)
{
case CHECKSUM_NONE:
skb->csum = csum_partial((char *)th, len, 0);
case CHECKSUM_HW:
if (tcp_check(th, len, saddr, daddr, skb->csum))
goto discard_it;
default:
/* CHECKSUM_UNNECESSARY */
}
sk = get_tcp_sock(saddr, th->source, daddr, th->dest, dev->pa_addr, skb->redirport);
if (!sk)
goto no_tcp_socket;
skb->sk = sk;
skb->seq = ntohl(th->seq);
skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
skb->ack_seq = ntohl(th->ack_seq);
skb->acked = 0;
skb->used = 0;
skb->free = 1;
skb->saddr = daddr;
skb->daddr = saddr;
/*
* We may need to add it to the backlog here.
*/
if (sk->users)
{
__skb_queue_tail(&sk->back_log, skb);
return(0);
}
}
/*
* If this socket has got a reset it's to all intents and purposes
* really dead. Count closed sockets as dead.
*
* Note: BSD appears to have a bug here. A 'closed' TCP in BSD
* simply drops data. This seems incorrect as a 'closed' TCP doesn't
* exist so should cause resets as if the port was unreachable.
*/
if (sk->zapped || sk->state==TCP_CLOSE)
goto no_tcp_socket;
if (!sk->prot)
{
printk(KERN_CRIT "IMPOSSIBLE 3\n");
return(0);
}
/*
* Charge the memory to the socket.
*/
skb->sk=sk;
atomic_add(skb->truesize, &sk->rmem_alloc);
/*
* Mark the time of the last received packet.
*/
sk->idletime = jiffies;
/*
* We should now do header prediction.
*/
/*
* This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
* don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
* compatibility. We also set up variables more thoroughly [Karn notes in the
* KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
*/
if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
{
/*
* Now deal with unusual cases.
*/
if(sk->state==TCP_LISTEN)
{
if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,0, 255);
/*
* We don't care about RST, and non-SYN segments are absorbed (old segments).
* Broadcast/multicast SYN isn't allowed. Note - bug: if you change the
* netmask on a running connection it can go broadcast. Even Suns have
* this problem so I'm ignoring it.
*/
#ifdef CONFIG_IP_TRANSPARENT_PROXY
/*
* We may get non-local addresses and still want to
* handle them locally, due to transparent proxying.
* Thus, narrow down the test to what is really meant.
*/
if(th->rst || !th->syn || th->ack || (r = ip_chk_addr(daddr)) == IS_BROADCAST || r == IS_MULTICAST)
#else
if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
#endif
{
kfree_skb(skb, FREE_READ);
return 0;
}
/*
* Guess we need to make a new socket up
*/
seq = secure_tcp_sequence_number(saddr, daddr,
skb->h.th->dest,
skb->h.th->source);
tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq);
/*
* Now we have several options: In theory there is nothing else
* in the frame. KA9Q has an option to send data with the syn,
* BSD accepts data with the syn up to the [to be] advertised window
* and Solaris 2.1 gives you a protocol error. For now we just ignore
* it, that fits the spec precisely and avoids incompatibilities. It
* would be nice in future to drop through and process the data.
*
* Now that T/TCP is starting to see use, we ought to queue this data.
*/
return 0;
}
/*
* Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
* then it's a new connection
*/
if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
{
kfree_skb(skb, FREE_READ);
return 0;
}
/*
* SYN sent means we have to look for a suitable ack and either reset
* for bad matches or go to connected. The SYN_SENT case is unusual and should
* not be in line code. [AC]
*/
if(sk->state==TCP_SYN_SENT)
{
/* Crossed SYN or previous junk segment */
if(th->ack)
{
/* We got an ack, but it's not a good ack.
* We used to test this with a call to tcp_ack,
* but this loses, because it takes the SYN
* packet out of the send queue, even if
* the ACK doesn't have the SYN bit set, and
* therefore isn't the one we are waiting for.
*/
if (after(skb->ack_seq, sk->sent_seq) || before(skb->ack_seq, sk->rcv_ack_seq))
{
/* Reset the ack - it's an ack from a
different connection [ th->rst is checked in tcp_send_reset()] */
tcp_statistics.TcpAttemptFails++;
tcp_send_reset(daddr, saddr, th,
sk->prot, opt,dev,0,255);
kfree_skb(skb, FREE_READ);
return(0);
}
if(th->rst)
return tcp_reset(sk,skb);
if(!th->syn)
{
/* A valid ack from a different connection
start. Shouldn't happen but cover it */
tcp_statistics.TcpAttemptFails++;
tcp_send_reset(daddr, saddr, th,
sk->prot, opt,dev,0,255);
kfree_skb(skb, FREE_READ);
return 0;
}
/* process the ACK, get the SYN packet out
* of the send queue, do other initial
* processing stuff. [We know it's good, and
* we know it's the SYN,ACK we want.]
*/
tcp_ack(sk,th,skb->ack_seq,len);
/*
* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
sk->acked_seq = skb->seq+1;
sk->lastwin_seq = skb->seq+1;
sk->fin_seq = skb->seq;
tcp_send_ack(sk);
tcp_set_state(sk, TCP_ESTABLISHED);
tcp_options(sk,th);
sk->dummy_th.dest=th->source;
sk->copied_seq = sk->acked_seq;
if(!sk->dead)
{
sk->state_change(sk);
sock_wake_async(sk->socket, 0);
}
if(sk->max_window==0)
{
sk->max_window = 32;
sk->mss = min(sk->max_window, sk->mtu);
}
/* Reset the RTT estimator to the initial
* state rather than testing to avoid
* updating it on the ACK to the SYN packet.
*/
sk->rtt = 0;
sk->rto = TCP_TIMEOUT_INIT;
sk->mdev = TCP_TIMEOUT_INIT;
}
else
{
/* See if SYN's cross. Drop if boring */
if(th->syn && !th->rst)
{
/* Crossed SYN's are fine - but talking to
yourself is right out... */
if(sk->saddr==saddr && sk->daddr==daddr &&
sk->dummy_th.source==th->source &&
sk->dummy_th.dest==th->dest)
{
tcp_statistics.TcpAttemptFails++;
return tcp_reset(sk,skb);
}
tcp_set_state(sk,TCP_SYN_RECV);
/*
* FIXME:
* Must send SYN|ACK here
*/
}
/* Discard junk segment */
kfree_skb(skb, FREE_READ);
return 0;
}
/*
* SYN_RECV with data maybe.. drop through
*/
goto rfc_step6;
}
/*
* BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
* a more complex suggestion for fixing these reuse issues in RFC1644
* but not yet ready for general use. Also see RFC1379.
*
* Note the funny way we go back to the top of this function for
* this case ("goto try_next_socket"). That also takes care of
* checking "sk->users" for the new socket as well as doing all
* the normal tests on the packet.
*/
#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
after(skb->seq, sk->acked_seq) && !th->rst)
{
u32 seq = sk->write_seq;
if(sk->debug)
printk("Doing a BSD time wait\n");
tcp_statistics.TcpEstabResets++;
atomic_sub(skb->truesize, &sk->rmem_alloc);
skb->sk = NULL;
sk->err=ECONNRESET;
tcp_set_state(sk, TCP_CLOSE);
sk->shutdown = SHUTDOWN_MASK;
sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr, dev->pa_addr, skb->redirport);
/* this is not really correct: we should check sk->users */
if (sk && sk->state==TCP_LISTEN)
{
skb->sk = sk;
atomic_add(skb->truesize, &sk->rmem_alloc);
tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
return 0;
}
kfree_skb(skb, FREE_READ);
return 0;
}
#endif
}
/*
* We are now in normal data flow (see the step list in the RFC)
* Note most of these are inline now. I'll inline the lot when
* I have time to test it hard and look at what gcc outputs
*/
if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
{
bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
kfree_skb(skb, FREE_READ);
return 0;
}
if(th->rst)
return tcp_reset(sk,skb);
/*
* Check for a SYN, and ensure it matches the SYN we were
* first sent. We have to handle the rather unusual (but valid)
* sequence that KA9Q derived products may generate of
*
* SYN
* SYN|ACK Data
* ACK (lost)
* SYN|ACK Data + More Data
* .. we must ACK not RST...
*
* We keep syn_seq as the sequence space occupied by the
* original syn.
*/
if(th->syn && skb->seq!=sk->syn_seq)
{
tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev,0, 255);
return tcp_reset(sk,skb);
}
/*
* Process the ACK
*/
if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
{
/*
* Our three way handshake failed.
*/
if(sk->state==TCP_SYN_RECV)
{
tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,0,255);
}
kfree_skb(skb, FREE_READ);
return 0;
}
rfc_step6: /* I'll clean this up later */
/*
* If the accepted buffer put us over our queue size we
* now drop it (we must process the ack first to avoid
* deadlock cases).
*/
/*
* Process urgent data
*/
tcp_urg(sk, th, len);
/*
* Process the encapsulated data
*/
if(tcp_data(skb,sk, saddr, len))
kfree_skb(skb, FREE_READ);
/*
* If our receive queue has grown past its limits,
* try to prune away duplicates etc..
*/
if (sk->rmem_alloc > sk->rcvbuf)
prune_queue(&sk->receive_queue);
/*
* And done
*/
return 0;
no_tcp_socket:
/*
* No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
*/
tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,0,255);
discard_it:
/*
* Discard frame
*/
skb->sk = NULL;
kfree_skb(skb, FREE_READ);
return 0;
}