changelog shortlog tags branches changeset files revisions annotate raw help

Mercurial > hg > plan9front / sys/src/9/ip/tcp.c

changeset 7252: 523d2d3e473f
parent: c28a5e9a92d5
author: cinap_lenrek@felloff.net
date: Wed, 22 May 2019 22:20:31 +0200
permissions: -rw-r--r--
description: devip: if the server does not support TCP ws option, disable window scaling (thanks joe9)

if the server responds without a window scale option in
its syn-ack, disable window scaling altogether, as both
sides need to understand the option.
1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/error.h"
7 
8 #include "ip.h"
9 
/*
 * Protocol constants: header geometry, timer parameters,
 * header flag bits, option codes, tcb flag bits and
 * connection states.
 */
enum
{
	QMAX		= 64*1024-1,
	IP_TCPPROTO	= 6,

	/* v4 header sizes */
	TCP4_IPLEN	= 8,
	TCP4_PHDRSIZE	= 12,
	TCP4_HDRSIZE	= 20,
	TCP4_TCBPHDRSZ	= 40,
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	/* v6 header sizes */
	TCP6_IPLEN	= 0,
	TCP6_PHDRSIZE	= 40,
	TCP6_HDRSIZE	= 20,
	TCP6_TCBPHDRSZ	= 60,
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	/* Tcptimer states */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	/* header flag bits */
	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	/* option codes and lengths */
	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* Maximum segment size */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* Bits to scale window size by */
	MSL2		= 10,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default maximum segment */
	DEF_MSS6	= 1220,		/* Default maximum segment (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* bits for Tcpctl.flags */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	LOGAGAIN	= 3,
	LOGDGAIN	= 2,

	/* Connection states; tcpstates[] is indexed by these */
	Closed		= 0,
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	/*
	 * window is 64kb · 2ⁿ
	 * these factors determine the ultimate bandwidth-delay product.
	 * with the scale of 4 used here: 64kb · 2⁴ = 1mb, which still
	 * exceeds 100mbps · 70ms (about 875kb).
	 */
	Maxqscale	= 4,		/* maximum queuing scale */
	Defadvscale	= 4,		/* default advertisement */
};
92 
/*
 * Printable names of the connection states.
 * Indexed by state, so the order must match the
 * Closed .. Time_wait enumeration.
 */
char *tcpstates[] =
{
	"Closed",
	"Listen",
	"Syn_sent",
	"Syn_received",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
100 
/*
 * A countdown timer serviced by tcpackproc once per tick.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* links on the Tcppriv.timers list while ON */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* links expired timers inside tcpackproc */
	int		state;		/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int		start;		/* value count is reloaded with by tcpgo */
	int		count;		/* ticks left; decremented by tcpackproc */
	void		(*func)(void*);	/* called with arg when count reaches 0 */
	void		*arg;
};
113 
114 /*
115  * v4 and v6 pseudo headers used for
116  * checksuming tcp
117  */
118 typedef struct Tcp4hdr Tcp4hdr;
119 struct Tcp4hdr
120 {
121  uchar vihl; /* Version and header length */
122  uchar tos; /* Type of service */
123  uchar length[2]; /* packet length */
124  uchar id[2]; /* Identification */
125  uchar frag[2]; /* Fragment information */
126  uchar Unused;
127  uchar proto;
128  uchar tcplen[2];
129  uchar tcpsrc[4];
130  uchar tcpdst[4];
131  uchar tcpsport[2];
132  uchar tcpdport[2];
133  uchar tcpseq[4];
134  uchar tcpack[4];
135  uchar tcpflag[2];
136  uchar tcpwin[2];
137  uchar tcpcksum[2];
138  uchar tcpurg[2];
139  /* Options segment */
140  uchar tcpopt[1];
141 };
142 
143 typedef struct Tcp6hdr Tcp6hdr;
144 struct Tcp6hdr
145 {
146  uchar vcf[4];
147  uchar ploadlen[2];
148  uchar proto;
149  uchar ttl;
150  uchar tcpsrc[IPaddrlen];
151  uchar tcpdst[IPaddrlen];
152  uchar tcpsport[2];
153  uchar tcpdport[2];
154  uchar tcpseq[4];
155  uchar tcpack[4];
156  uchar tcpflag[2];
157  uchar tcpwin[2];
158  uchar tcpcksum[2];
159  uchar tcpurg[2];
160  /* Options segment */
161  uchar tcpopt[1];
162 };
163 
164 /*
165  * this represents the control info
166  * for a single packet. It is derived from
167  * a packet in ntohtcp{4,6}() and stuck into
168  * a packet in htontcp{4,6}().
169  */
170 typedef struct Tcp Tcp;
171 struct Tcp
172 {
173  ushort source;
174  ushort dest;
175  ulong seq;
176  ulong ack;
177  uchar flags;
178  uchar update;
179  ushort ws; /* window scale option */
180  ulong wnd; /* prescaled window*/
181  ushort urg;
182  ushort mss; /* max segment size option (if not zero) */
183  ushort len; /* size of data */
184 };
185 
186 /*
187  * this header is malloc'd to thread together fragments
188  * waiting to be coalesced
189  */
190 typedef struct Reseq Reseq;
191 struct Reseq
192 {
193  Reseq *next;
194  Tcp seg;
195  Block *bp;
196  ushort length;
197 };
198 
199 /*
200  * the qlock in the Conv locks this structure
201  */
202 typedef struct Tcpctl Tcpctl;
203 struct Tcpctl
204 {
205  uchar state; /* Connection state */
206  uchar type; /* Listening or active connection */
207  uchar code; /* Icmp code */
208  struct {
209  ulong una; /* Unacked data pointer */
210  ulong nxt; /* Next sequence expected */
211  ulong ptr; /* Data pointer */
212  ulong wnd; /* Tcp send window */
213  ulong urg; /* Urgent data pointer */
214  ulong wl2;
215  uint scale; /* how much to right shift window in xmitted packets */
216  /* to implement tahoe and reno TCP */
217  ulong dupacks; /* number of duplicate acks rcvd */
218  ulong partialack;
219  int recovery; /* loss recovery flag */
220  int retransmit; /* retransmit 1 packet @ una flag */
221  int rto;
222  ulong rxt; /* right window marker for recovery "recover" rfc3782 */
223  } snd;
224  struct {
225  ulong nxt; /* Receive pointer to next uchar slot */
226  ulong wnd; /* Receive window incoming */
227  ulong wsnt; /* Last wptr sent. important to track for large bdp */
228  ulong wptr;
229  ulong urg; /* Urgent pointer */
230  ulong ackptr; /* last acked sequence */
231  int blocked;
232  uint scale; /* how much to left shift window in rcv'd packets */
233  } rcv;
234  ulong iss; /* Initial sequence number */
235  ulong cwind; /* Congestion window */
236  ulong abcbytes; /* appropriate byte counting rfc 3465 */
237  uint scale; /* desired snd.scale */
238  ulong ssthresh; /* Slow start threshold */
239  int resent; /* Bytes just resent */
240  int irs; /* Initial received squence */
241  ushort mss; /* Maximum segment size */
242  int rerecv; /* Overlap of data rerecevived */
243  ulong window; /* Our receive window (queue) */
244  uint qscale; /* Log2 of our receive window (queue) */
245  uchar backoff; /* Exponential backoff counter */
246  int backedoff; /* ms we've backed off for rexmits */
247  uchar flags; /* State flags */
248  Reseq *reseq; /* Resequencing queue */
249  int nreseq;
250  int reseqlen;
251  Tcptimer timer; /* Activity timer */
252  Tcptimer acktimer; /* Acknowledge timer */
253  Tcptimer rtt_timer; /* Round trip timer */
254  Tcptimer katimer; /* keep alive timer */
255  ulong rttseq; /* Round trip sequence */
256  int srtt; /* Smoothed round trip */
257  int mdev; /* Mean deviation of round trip */
258  int kacounter; /* count down for keep alive */
259  uint sndsyntime; /* time syn sent */
260  ulong time; /* time Finwait2 or Syn_received was sent */
261  ulong timeuna; /* snd.una when time was set */
262  int nochecksum; /* non-zero means don't send checksums */
263  int flgcnt; /* number of flags in the sequence (FIN,SEQ) */
264 
265  union {
266  Tcp4hdr tcp4hdr;
267  Tcp6hdr tcp6hdr;
268  } protohdr; /* prototype header */
269 };
270 
271 /*
272  * New calls are put in limbo rather than having a conversation structure
273  * allocated. Thus, a SYN attack results in lots of limbo'd calls but not
274  * any real Conv structures mucking things up. Calls in limbo rexmit their
275  * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
276  *
277  * In particular they aren't on a listener's queue so that they don't figure
278  * in the input queue limit.
279  *
280  * If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
281  * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore
282  * there is no hashing of this list.
283  */
284 typedef struct Limbo Limbo;
285 struct Limbo
286 {
287  Limbo *next;
288 
289  uchar laddr[IPaddrlen];
290  uchar raddr[IPaddrlen];
291  ushort lport;
292  ushort rport;
293  ulong irs; /* initial received sequence */
294  ulong iss; /* initial sent sequence */
295  ushort mss; /* mss from the other end */
296  ushort rcvscale; /* how much to scale rcvd windows */
297  ushort sndscale; /* how much to scale sent windows */
298  ulong lastsend; /* last time we sent a synack */
299  uchar version; /* v4 or v6 */
300  uchar rexmits; /* number of retransmissions */
301 };
302 
303 int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */
304 
/*
 * Indices into Tcppriv.stats[] and statnames[].
 */
enum
{
	/* MIB stats */
	MaxConn,
	Mss,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransSegsSent,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	Resequenced,
	OutOfOrder,
	ReseqBytelim,
	ReseqPktlim,
	Delayack,
	Wopenack,

	Recovery,
	RecoveryDone,
	RecoveryRTO,
	RecoveryNoSeq,
	RecoveryCwind,
	RecoveryPA,

	Nstats
};
341 
342 static char *statnames[Nstats] =
343 {
344 [MaxConn] "MaxConn",
345 [Mss] "MaxSegment",
346 [ActiveOpens] "ActiveOpens",
347 [PassiveOpens] "PassiveOpens",
348 [EstabResets] "EstabResets",
349 [CurrEstab] "CurrEstab",
350 [InSegs] "InSegs",
351 [OutSegs] "OutSegs",
352 [RetransSegs] "RetransSegs",
353 [RetransSegsSent] "RetransSegsSent",
354 [RetransTimeouts] "RetransTimeouts",
355 [InErrs] "InErrs",
356 [OutRsts] "OutRsts",
357 [CsumErrs] "CsumErrs",
358 [HlenErrs] "HlenErrs",
359 [LenErrs] "LenErrs",
360 [OutOfOrder] "OutOfOrder",
361 [Resequenced] "Resequenced",
362 [ReseqBytelim] "ReseqBytelim",
363 [ReseqPktlim] "ReseqPktlim",
364 [Delayack] "Delayack",
365 [Wopenack] "Wopenack",
366 
367 [Recovery] "Recovery",
368 [RecoveryDone] "RecoveryDone",
369 [RecoveryRTO] "RecoveryRTO",
370 
371 [RecoveryNoSeq] "RecoveryNoSeq",
372 [RecoveryCwind] "RecoveryCwind",
373 [RecoveryPA] "RecoveryPA",
374 };
375 
376 typedef struct Tcppriv Tcppriv;
377 struct Tcppriv
378 {
379  /* List of active timers */
380  QLock tl;
381  Tcptimer *timers;
382 
383  /* hash table for matching conversations */
384  Ipht ht;
385 
386  /* calls in limbo waiting for an ACK to our SYN ACK */
387  int nlimbo;
388  Limbo *lht[NLHT];
389 
390  /* for keeping track of tcpackproc */
391  QLock apl;
392  int ackprocstarted;
393 
394  uvlong stats[Nstats];
395 };
396 
/*
 * Setting tcpporthogdefense to non-zero enables Dong Lin's
 * solution to hijacked systems staking out ports as a form
 * of DoS attack.
 *
 * To avoid stateless Conv hogs, we pick a sequence number at random. If
 * that number gets acked by the other end, we shut down the connection.
 * Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
407 
408 static int addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
409 static int dumpreseq(Tcpctl*);
410 static void getreseq(Tcpctl*, Tcp*, Block**, ushort*);
411 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
412 static void limborexmit(Proto*);
413 static void localclose(Conv*, char*);
414 static void procsyn(Conv*, Tcp*);
415 static void tcpacktimer(void*);
416 static void tcpiput(Proto*, Ipifc*, Block*);
417 static void tcpkeepalive(void*);
418 static void tcpoutput(Conv*);
419 static void tcprcvwin(Conv*);
420 static void tcprxmit(Conv*);
421 static void tcpsetkacounter(Tcpctl*);
422 static void tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
423 static void tcpsettimer(Tcpctl*);
424 static void tcpsndsyn(Conv*, Tcpctl*);
425 static void tcpstart(Conv*, int);
426 static void tcpsynackrtt(Conv*);
427 static void tcptimeout(void*);
428 static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
429 
430 static void
431 tcpsetstate(Conv *s, uchar newstate)
432 {
433  Tcpctl *tcb;
434  uchar oldstate;
435  Tcppriv *tpriv;
436 
437  tpriv = s->p->priv;
438 
439  tcb = (Tcpctl*)s->ptcl;
440 
441  oldstate = tcb->state;
442  if(oldstate == newstate)
443  return;
444 
445  if(oldstate == Established)
446  tpriv->stats[CurrEstab]--;
447  if(newstate == Established)
448  tpriv->stats[CurrEstab]++;
449 
450  switch(newstate) {
451  case Closed:
452  qclose(s->rq);
453  qclose(s->wq);
454  qclose(s->eq);
455  break;
456 
457  case Close_wait: /* Remote closes */
458  qhangup(s->rq, nil);
459  break;
460  }
461 
462  tcb->state = newstate;
463 
464  if(oldstate == Syn_sent && newstate != Closed)
465  Fsconnected(s, nil);
466 }
467 
468 static char*
469 tcpconnect(Conv *c, char **argv, int argc)
470 {
471  char *e;
472  Tcpctl *tcb;
473 
474  tcb = (Tcpctl*)(c->ptcl);
475  if(tcb->state != Closed)
476  return Econinuse;
477 
478  e = Fsstdconnect(c, argv, argc);
479  if(e != nil)
480  return e;
481  tcpstart(c, TCP_CONNECT);
482 
483  return nil;
484 }
485 
486 static int
487 tcpstate(Conv *c, char *state, int n)
488 {
489  Tcpctl *s;
490 
491  s = (Tcpctl*)(c->ptcl);
492 
493  return snprint(state, n,
494  "%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
495  tcpstates[s->state],
496  c->rq ? qlen(c->rq) : 0,
497  c->wq ? qlen(c->wq) : 0,
498  s->nreseq, s->reseqlen,
499  s->srtt, s->mdev, s->ssthresh,
500  s->cwind, s->snd.wnd, s->snd.scale, s->rcv.wnd, s->rcv.scale,
501  s->qscale,
502  s->timer.start, s->timer.count, s->rerecv,
503  s->katimer.start, s->katimer.count);
504 }
505 
506 static int
507 tcpinuse(Conv *c)
508 {
509  Tcpctl *s;
510 
511  s = (Tcpctl*)(c->ptcl);
512  return s->state != Closed;
513 }
514 
515 static char*
516 tcpannounce(Conv *c, char **argv, int argc)
517 {
518  char *e;
519  Tcpctl *tcb;
520 
521  tcb = (Tcpctl*)(c->ptcl);
522  if(tcb->state != Closed)
523  return Econinuse;
524 
525  e = Fsstdannounce(c, argv, argc);
526  if(e != nil)
527  return e;
528  tcpstart(c, TCP_LISTEN);
529  Fsconnected(c, nil);
530 
531  return nil;
532 }
533 
534 /*
535  * tcpclose is always called with the q locked
536  */
537 static void
538 tcpclose(Conv *c)
539 {
540  Tcpctl *tcb;
541 
542  tcb = (Tcpctl*)c->ptcl;
543 
544  qhangup(c->rq, nil);
545  qhangup(c->wq, nil);
546  qhangup(c->eq, nil);
547  qflush(c->rq);
548 
549  switch(tcb->state) {
550  case Listen:
551  /*
552  * reset any incoming calls to this listener
553  */
554  Fsconnected(c, "Hangup");
555 
556  localclose(c, nil);
557  break;
558  case Closed:
559  case Syn_sent:
560  localclose(c, nil);
561  break;
562  case Syn_received:
563  case Established:
564  tcb->flgcnt++;
565  tcb->snd.nxt++;
566  tcpsetstate(c, Finwait1);
567  tcpoutput(c);
568  break;
569  case Close_wait:
570  tcb->flgcnt++;
571  tcb->snd.nxt++;
572  tcpsetstate(c, Last_ack);
573  tcpoutput(c);
574  break;
575  }
576 }
577 
578 static void
579 tcpkick(void *x)
580 {
581  Conv *s = x;
582  Tcpctl *tcb;
583 
584  tcb = (Tcpctl*)s->ptcl;
585 
586  if(waserror()){
587  qunlock(s);
588  nexterror();
589  }
590  qlock(s);
591 
592  switch(tcb->state) {
593  case Syn_sent:
594  case Syn_received:
595  case Established:
596  case Close_wait:
597  /*
598  * Push data
599  */
600  tcpoutput(s);
601  break;
602  default:
603  localclose(s, "Hangup");
604  break;
605  }
606 
607  qunlock(s);
608  poperror();
609 }
610 
611 static int seq_lt(ulong, ulong);
612 
613 static void
614 tcprcvwin(Conv *s) /* Call with tcb locked */
615 {
616  int w;
617  Tcpctl *tcb;
618 
619  tcb = (Tcpctl*)s->ptcl;
620  w = tcb->window - qlen(s->rq);
621  if(w < 0)
622  w = 0;
623  /* RFC 1122 § 4.2.2.17 do not move right edge of window left */
624  if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
625  w = tcb->rcv.wptr - tcb->rcv.nxt;
626  if(w != tcb->rcv.wnd)
627  if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
628  tcb->rcv.blocked = 1;
629  netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
630  tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
631  }
632  tcb->rcv.wnd = w;
633  tcb->rcv.wptr = tcb->rcv.nxt + w;
634 }
635 
636 static void
637 tcpacktimer(void *v)
638 {
639  Tcpctl *tcb;
640  Conv *s;
641 
642  s = v;
643  tcb = (Tcpctl*)s->ptcl;
644 
645  if(waserror()){
646  qunlock(s);
647  nexterror();
648  }
649  qlock(s);
650  if(tcb->state != Closed){
651  tcb->flags |= FORCE;
652  tcpoutput(s);
653  }
654  qunlock(s);
655  poperror();
656 }
657 
658 static void
659 tcpcongestion(Tcpctl *tcb)
660 {
661  ulong inflight;
662 
663  inflight = tcb->snd.nxt - tcb->snd.una;
664  if(inflight > tcb->cwind)
665  inflight = tcb->cwind;
666  tcb->ssthresh = inflight / 2;
667  if(tcb->ssthresh < 2*tcb->mss)
668  tcb->ssthresh = 2*tcb->mss;
669 }
670 
671 enum {
672  L = 2, /* aggressive slow start; legal values ∈ (1.0, 2.0) */
673 };
674 
675 static void
676 tcpabcincr(Tcpctl *tcb, uint acked)
677 {
678  uint limit;
679 
680  tcb->abcbytes += acked;
681  if(tcb->cwind < tcb->ssthresh){
682  /* slow start */
683  if(tcb->snd.rto)
684  limit = 1*tcb->mss;
685  else
686  limit = L*tcb->mss;
687  tcb->cwind += MIN(tcb->abcbytes, limit);
688  tcb->abcbytes = 0;
689  }
690  else{
691  tcb->snd.rto = 0;
692  /* avoidance */
693  if(tcb->abcbytes >= tcb->cwind){
694  tcb->abcbytes -= tcb->cwind;
695  tcb->cwind += tcb->mss;
696  }
697  }
698 }
699 
700 static void
701 tcpcreate(Conv *c)
702 {
703  c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
704  c->wq = qopen(QMAX, Qkick, tcpkick, c);
705 }
706 
707 static void
708 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
709 {
710  if(newstate != TcptimerON){
711  if(t->state == TcptimerON){
712  /* unchain */
713  if(priv->timers == t){
714  priv->timers = t->next;
715  if(t->prev != nil)
716  panic("timerstate1");
717  }
718  if(t->next)
719  t->next->prev = t->prev;
720  if(t->prev)
721  t->prev->next = t->next;
722  t->next = t->prev = nil;
723  }
724  } else {
725  if(t->state != TcptimerON){
726  /* chain */
727  if(t->prev != nil || t->next != nil)
728  panic("timerstate2");
729  t->prev = nil;
730  t->next = priv->timers;
731  if(t->next)
732  t->next->prev = t;
733  priv->timers = t;
734  }
735  }
736  t->state = newstate;
737 }
738 
739 static void
740 tcpackproc(void *a)
741 {
742  Tcptimer *t, *tp, *timeo;
743  Proto *tcp;
744  Tcppriv *priv;
745  int loop;
746 
747  tcp = a;
748  priv = tcp->priv;
749 
750  while(waserror())
751  ;
752 
753  for(;;) {
754  tsleep(&up->sleep, return0, 0, MSPTICK);
755 
756  qlock(&priv->tl);
757  timeo = nil;
758  loop = 0;
759  for(t = priv->timers; t != nil; t = tp) {
760  if(loop++ > 10000)
761  panic("tcpackproc1");
762  tp = t->next;
763  if(t->state == TcptimerON) {
764  t->count--;
765  if(t->count == 0) {
766  timerstate(priv, t, TcptimerDONE);
767  t->readynext = timeo;
768  timeo = t;
769  }
770  }
771  }
772  qunlock(&priv->tl);
773 
774  loop = 0;
775  for(t = timeo; t != nil; t = t->readynext) {
776  if(loop++ > 10000)
777  panic("tcpackproc2");
778  if(t->state == TcptimerDONE && t->func != nil && !waserror()){
779  (*t->func)(t->arg);
780  poperror();
781  }
782  }
783 
784  limborexmit(tcp);
785  }
786 }
787 
788 static void
789 tcpgo(Tcppriv *priv, Tcptimer *t)
790 {
791  if(t == nil || t->start == 0)
792  return;
793 
794  qlock(&priv->tl);
795  t->count = t->start;
796  timerstate(priv, t, TcptimerON);
797  qunlock(&priv->tl);
798 }
799 
800 static void
801 tcphalt(Tcppriv *priv, Tcptimer *t)
802 {
803  if(t == nil)
804  return;
805 
806  qlock(&priv->tl);
807  timerstate(priv, t, TcptimerOFF);
808  qunlock(&priv->tl);
809 }
810 
/*
 * Exponential backoff multiplier: 2ⁿ.
 *
 * The shift count is clamped to what an int can represent:
 * a negative n yields 1 and a large n yields the largest
 * power of two that fits. Shifting by a negative count, or
 * far enough to reach the sign bit, is undefined in C.
 */
static int
backoff(int n)
{
	enum { Maxshift = 8*sizeof(int) - 2 };

	if(n < 0)
		n = 0;
	if(n > Maxshift)
		n = Maxshift;
	return 1 << n;
}
816 
817 static void
818 localclose(Conv *s, char *reason) /* called with tcb locked */
819 {
820  Tcpctl *tcb;
821  Tcppriv *tpriv;
822 
823  tpriv = s->p->priv;
824  tcb = (Tcpctl*)s->ptcl;
825 
826  iphtrem(&tpriv->ht, s);
827 
828  tcphalt(tpriv, &tcb->timer);
829  tcphalt(tpriv, &tcb->rtt_timer);
830  tcphalt(tpriv, &tcb->acktimer);
831  tcphalt(tpriv, &tcb->katimer);
832 
833  /* Flush reassembly queue; nothing more can arrive */
834  dumpreseq(tcb);
835 
836  if(tcb->state == Syn_sent)
837  Fsconnected(s, reason);
838  if(s->state == Announced)
839  wakeup(&s->listenr);
840 
841  qhangup(s->rq, reason);
842  qhangup(s->wq, reason);
843 
844  tcpsetstate(s, Closed);
845 }
846 
847 /* mtu (- TCP + IP hdr len) of 1st hop */
848 static int
849 tcpmtu(Route *r, int version, uint *scale)
850 {
851  Ipifc *ifc;
852  int mtu;
853 
854  /*
855  * set the ws. it doesn't commit us to anything.
856  * ws is the ultimate limit to the bandwidth-delay product.
857  */
858  *scale = Defadvscale;
859 
860  /*
861  * currently we do not implement path MTU discovery
862  * so use interface MTU *only* if directly reachable
863  * or when we use V4 which allows routers to fragment.
864  * otherwise, we use the default MSS which assumes a
865  * safe minimum MTU of 1280 bytes for V6.
866  */
867  if(r != nil && (ifc = r->ifc) != nil){
868  mtu = ifc->maxtu - ifc->m->hsize;
869  if(version == V4)
870  return mtu - (TCP4_PKT + TCP4_HDRSIZE);
871  mtu -= TCP6_PKT + TCP6_HDRSIZE;
872  if((r->type & (Rifc|Runi)) != 0 || mtu <= DEF_MSS6)
873  return mtu;
874  }
875  if(version == V6)
876  return DEF_MSS6;
877  else
878  return DEF_MSS;
879 }
880 
881 static void
882 inittcpctl(Conv *s, int mode)
883 {
884  Tcpctl *tcb;
885  Tcp4hdr* h4;
886  Tcp6hdr* h6;
887  Tcppriv *tpriv;
888  int mss;
889 
890  tcb = (Tcpctl*)s->ptcl;
891 
892  memset(tcb, 0, sizeof(Tcpctl));
893 
894  tcb->ssthresh = QMAX; /* reset by tcpsetscale() */
895  tcb->srtt = tcp_irtt<<LOGAGAIN;
896  tcb->mdev = 0;
897 
898  /* setup timers */
899  tcb->timer.start = tcp_irtt / MSPTICK;
900  tcb->timer.func = tcptimeout;
901  tcb->timer.arg = s;
902  tcb->rtt_timer.start = MAX_TIME;
903  tcb->acktimer.start = TCP_ACK / MSPTICK;
904  tcb->acktimer.func = tcpacktimer;
905  tcb->acktimer.arg = s;
906  tcb->katimer.start = DEF_KAT / MSPTICK;
907  tcb->katimer.func = tcpkeepalive;
908  tcb->katimer.arg = s;
909 
910  mss = DEF_MSS;
911 
912  /* create a prototype(pseudo) header */
913  if(mode != TCP_LISTEN){
914  if(ipcmp(s->laddr, IPnoaddr) == 0)
915  findlocalip(s->p->f, s->laddr, s->raddr);
916 
917  switch(s->ipversion){
918  case V4:
919  h4 = &tcb->protohdr.tcp4hdr;
920  memset(h4, 0, sizeof(*h4));
921  h4->proto = IP_TCPPROTO;
922  hnputs(h4->tcpsport, s->lport);
923  hnputs(h4->tcpdport, s->rport);
924  v6tov4(h4->tcpsrc, s->laddr);
925  v6tov4(h4->tcpdst, s->raddr);
926  break;
927  case V6:
928  h6 = &tcb->protohdr.tcp6hdr;
929  memset(h6, 0, sizeof(*h6));
930  h6->proto = IP_TCPPROTO;
931  hnputs(h6->tcpsport, s->lport);
932  hnputs(h6->tcpdport, s->rport);
933  ipmove(h6->tcpsrc, s->laddr);
934  ipmove(h6->tcpdst, s->raddr);
935  mss = DEF_MSS6;
936  break;
937  default:
938  panic("inittcpctl: version %d", s->ipversion);
939  }
940  }
941 
942  tcb->mss = tcb->cwind = mss;
943  tcb->abcbytes = 0;
944  tpriv = s->p->priv;
945  tpriv->stats[Mss] = tcb->mss;
946 
947  /* default is no window scaling */
948  tcpsetscale(s, tcb, 0, 0);
949 }
950 
951 /*
952  * called with s qlocked
953  */
954 static void
955 tcpstart(Conv *s, int mode)
956 {
957  Tcpctl *tcb;
958  Tcppriv *tpriv;
959  char kpname[KNAMELEN];
960 
961  tpriv = s->p->priv;
962 
963  if(tpriv->ackprocstarted == 0){
964  qlock(&tpriv->apl);
965  if(tpriv->ackprocstarted == 0){
966  snprint(kpname, sizeof(kpname), "#I%dtcpack", s->p->f->dev);
967  kproc(kpname, tcpackproc, s->p);
968  tpriv->ackprocstarted = 1;
969  }
970  qunlock(&tpriv->apl);
971  }
972 
973  tcb = (Tcpctl*)s->ptcl;
974 
975  inittcpctl(s, mode);
976 
977  iphtadd(&tpriv->ht, s);
978  switch(mode) {
979  case TCP_LISTEN:
980  tpriv->stats[PassiveOpens]++;
981  tcb->flags |= CLONE;
982  tcpsetstate(s, Listen);
983  break;
984 
985  case TCP_CONNECT:
986  tpriv->stats[ActiveOpens]++;
987  tcb->flags |= ACTIVE;
988  tcpsndsyn(s, tcb);
989  tcpsetstate(s, Syn_sent);
990  tcpoutput(s);
991  break;
992  }
993 }
994 
995 static char*
996 tcpflag(char *buf, char *e, ushort flag)
997 {
998  char *p;
999 
1000  p = seprint(buf, e, "%d", flag>>10); /* Head len */
1001  if(flag & URG)
1002  p = seprint(p, e, " URG");
1003  if(flag & ACK)
1004  p = seprint(p, e, " ACK");
1005  if(flag & PSH)
1006  p = seprint(p, e, " PSH");
1007  if(flag & RST)
1008  p = seprint(p, e, " RST");
1009  if(flag & SYN)
1010  p = seprint(p, e, " SYN");
1011  if(flag & FIN)
1012  p = seprint(p, e, " FIN");
1013  USED(p);
1014  return buf;
1015 }
1016 
1017 static Block*
1018 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
1019 {
1020  int dlen;
1021  Tcp6hdr *h;
1022  ushort csum;
1023  ushort hdrlen, optpad = 0;
1024  uchar *opt;
1025 
1026  hdrlen = TCP6_HDRSIZE;
1027  if(tcph->flags & SYN){
1028  if(tcph->mss)
1029  hdrlen += MSS_LENGTH;
1030  if(tcph->ws)
1031  hdrlen += WS_LENGTH;
1032  optpad = hdrlen & 3;
1033  if(optpad)
1034  optpad = 4 - optpad;
1035  hdrlen += optpad;
1036  }
1037 
1038  if(data) {
1039  dlen = blocklen(data);
1040  data = padblock(data, hdrlen + TCP6_PKT);
1041  }
1042  else {
1043  dlen = 0;
1044  data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */
1045  data->wp += hdrlen + TCP6_PKT;
1046  }
1047 
1048  /* copy in pseudo ip header plus port numbers */
1049  h = (Tcp6hdr *)(data->rp);
1050  memmove(h, ph, TCP6_TCBPHDRSZ);
1051 
1052  /* compose pseudo tcp header, do cksum calculation */
1053  hnputl(h->vcf, hdrlen + dlen);
1054  h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1055  h->ttl = ph->proto;
1056 
1057  /* copy in variable bits */
1058  hnputl(h->tcpseq, tcph->seq);
1059  hnputl(h->tcpack, tcph->ack);
1060  hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1061  hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1062  hnputs(h->tcpurg, tcph->urg);
1063 
1064  if(tcph->flags & SYN){
1065  opt = h->tcpopt;
1066  if(tcph->mss != 0){
1067  *opt++ = MSSOPT;
1068  *opt++ = MSS_LENGTH;
1069  hnputs(opt, tcph->mss);
1070  opt += 2;
1071  }
1072  if(tcph->ws != 0){
1073  *opt++ = WSOPT;
1074  *opt++ = WS_LENGTH;
1075  *opt++ = tcph->ws;
1076  }
1077  while(optpad-- > 0)
1078  *opt++ = NOOPOPT;
1079  }
1080 
1081  if(tcb != nil && tcb->nochecksum){
1082  h->tcpcksum[0] = h->tcpcksum[1] = 0;
1083  } else {
1084  csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1085  hnputs(h->tcpcksum, csum);
1086  }
1087 
1088  /* move from pseudo header back to normal ip header */
1089  memset(h->vcf, 0, 4);
1090  h->vcf[0] = IP_VER6;
1091  hnputs(h->ploadlen, hdrlen+dlen);
1092  h->proto = ph->proto;
1093 
1094  return data;
1095 }
1096 
1097 static Block*
1098 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1099 {
1100  int dlen;
1101  Tcp4hdr *h;
1102  ushort csum;
1103  ushort hdrlen, optpad = 0;
1104  uchar *opt;
1105 
1106  hdrlen = TCP4_HDRSIZE;
1107  if(tcph->flags & SYN){
1108  if(tcph->mss)
1109  hdrlen += MSS_LENGTH;
1110  if(1)
1111  hdrlen += WS_LENGTH;
1112  optpad = hdrlen & 3;
1113  if(optpad)
1114  optpad = 4 - optpad;
1115  hdrlen += optpad;
1116  }
1117 
1118  if(data) {
1119  dlen = blocklen(data);
1120  data = padblock(data, hdrlen + TCP4_PKT);
1121  }
1122  else {
1123  dlen = 0;
1124  data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */
1125  data->wp += hdrlen + TCP4_PKT;
1126  }
1127 
1128  /* copy in pseudo ip header plus port numbers */
1129  h = (Tcp4hdr *)(data->rp);
1130  memmove(h, ph, TCP4_TCBPHDRSZ);
1131 
1132  /* copy in variable bits */
1133  hnputs(h->tcplen, hdrlen + dlen);
1134  hnputl(h->tcpseq, tcph->seq);
1135  hnputl(h->tcpack, tcph->ack);
1136  hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1137  hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1138  hnputs(h->tcpurg, tcph->urg);
1139 
1140  if(tcph->flags & SYN){
1141  opt = h->tcpopt;
1142  if(tcph->mss != 0){
1143  *opt++ = MSSOPT;
1144  *opt++ = MSS_LENGTH;
1145  hnputs(opt, tcph->mss);
1146  opt += 2;
1147  }
1148  /* always offer. rfc1323 §2.2 */
1149  if(1){
1150  *opt++ = WSOPT;
1151  *opt++ = WS_LENGTH;
1152  *opt++ = tcph->ws;
1153  }
1154  while(optpad-- > 0)
1155  *opt++ = NOOPOPT;
1156  }
1157 
1158  if(tcb != nil && tcb->nochecksum){
1159  h->tcpcksum[0] = h->tcpcksum[1] = 0;
1160  } else {
1161  csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1162  hnputs(h->tcpcksum, csum);
1163  }
1164 
1165  return data;
1166 }
1167 
1168 static int
1169 ntohtcp6(Tcp *tcph, Block **bpp)
1170 {
1171  Tcp6hdr *h;
1172  uchar *optr;
1173  ushort hdrlen;
1174  ushort optlen;
1175  int n;
1176 
1177  *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1178  if(*bpp == nil)
1179  return -1;
1180 
1181  h = (Tcp6hdr *)((*bpp)->rp);
1182  tcph->source = nhgets(h->tcpsport);
1183  tcph->dest = nhgets(h->tcpdport);
1184  tcph->seq = nhgetl(h->tcpseq);
1185  tcph->ack = nhgetl(h->tcpack);
1186  hdrlen = (h->tcpflag[0]>>2) & ~3;
1187  if(hdrlen < TCP6_HDRSIZE) {
1188  freeblist(*bpp);
1189  return -1;
1190  }
1191 
1192  tcph->flags = h->tcpflag[1];
1193  tcph->wnd = nhgets(h->tcpwin);
1194  tcph->urg = nhgets(h->tcpurg);
1195  tcph->mss = 0;
1196  tcph->ws = 0;
1197  tcph->update = 0;
1198  tcph->len = nhgets(h->ploadlen) - hdrlen;
1199 
1200  *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1201  if(*bpp == nil)
1202  return -1;
1203 
1204  optr = h->tcpopt;
1205  n = hdrlen - TCP6_HDRSIZE;
1206  while(n > 0 && *optr != EOLOPT) {
1207  if(*optr == NOOPOPT) {
1208  n--;
1209  optr++;
1210  continue;
1211  }
1212  optlen = optr[1];
1213  if(optlen < 2 || optlen > n)
1214  break;
1215  switch(*optr) {
1216  case MSSOPT:
1217  if(optlen == MSS_LENGTH)
1218  tcph->mss = nhgets(optr+2);
1219  break;
1220  case WSOPT:
1221  if(optlen == WS_LENGTH && *(optr+2) <= 14)
1222  tcph->ws = *(optr+2);
1223  break;
1224  }
1225  n -= optlen;
1226  optr += optlen;
1227  }
1228  return hdrlen;
1229 }
1230 
1231 static int
1232 ntohtcp4(Tcp *tcph, Block **bpp)
1233 {
1234  Tcp4hdr *h;
1235  uchar *optr;
1236  ushort hdrlen;
1237  ushort optlen;
1238  int n;
1239 
1240  *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1241  if(*bpp == nil)
1242  return -1;
1243 
1244  h = (Tcp4hdr *)((*bpp)->rp);
1245  tcph->source = nhgets(h->tcpsport);
1246  tcph->dest = nhgets(h->tcpdport);
1247  tcph->seq = nhgetl(h->tcpseq);
1248  tcph->ack = nhgetl(h->tcpack);
1249 
1250  hdrlen = (h->tcpflag[0]>>2) & ~3;
1251  if(hdrlen < TCP4_HDRSIZE) {
1252  freeblist(*bpp);
1253  return -1;
1254  }
1255 
1256  tcph->flags = h->tcpflag[1];
1257  tcph->wnd = nhgets(h->tcpwin);
1258  tcph->urg = nhgets(h->tcpurg);
1259  tcph->mss = 0;
1260  tcph->ws = 0;
1261  tcph->update = 0;
1262  tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1263 
1264  *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1265  if(*bpp == nil)
1266  return -1;
1267 
1268  optr = h->tcpopt;
1269  n = hdrlen - TCP4_HDRSIZE;
1270  while(n > 0 && *optr != EOLOPT) {
1271  if(*optr == NOOPOPT) {
1272  n--;
1273  optr++;
1274  continue;
1275  }
1276  optlen = optr[1];
1277  if(optlen < 2 || optlen > n)
1278  break;
1279  switch(*optr) {
1280  case MSSOPT:
1281  if(optlen == MSS_LENGTH)
1282  tcph->mss = nhgets(optr+2);
1283  break;
1284  case WSOPT:
1285  if(optlen == WS_LENGTH && *(optr+2) <= 14)
1286  tcph->ws = *(optr+2);
1287  break;
1288  }
1289  n -= optlen;
1290  optr += optlen;
1291  }
1292  return hdrlen;
1293 }
1294 
1295 /*
1296  * For outgoing calls, generate an initial sequence
1297  * number and put a SYN on the send queue
1298  */
1299 static void
1300 tcpsndsyn(Conv *s, Tcpctl *tcb)
1301 {
1302  Tcppriv *tpriv;
1303 
1304  tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1305  tcb->rttseq = tcb->iss;
1306  tcb->snd.wl2 = tcb->iss;
1307  tcb->snd.una = tcb->iss;
1308  tcb->snd.rxt = tcb->iss;
1309  tcb->snd.ptr = tcb->rttseq;
1310  tcb->snd.nxt = tcb->rttseq;
1311  tcb->flgcnt++;
1312  tcb->flags |= FORCE;
1313  tcb->sndsyntime = NOW;
1314 
1315  /* set desired mss and scale */
1316  tcb->mss = tcpmtu(v6lookup(s->p->f, s->raddr, s->laddr, s), s->ipversion, &tcb->scale);
1317  tpriv = s->p->priv;
1318  tpriv->stats[Mss] = tcb->mss;
1319 }
1320 
1321 void
1322 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1323 {
1324  Block *hbp;
1325  uchar rflags;
1326  Tcppriv *tpriv;
1327  Tcp4hdr ph4;
1328  Tcp6hdr ph6;
1329 
1330  netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1331 
1332  tpriv = tcp->priv;
1333 
1334  if(seg->flags & RST)
1335  return;
1336 
1337  /* make pseudo header */
1338  switch(version) {
1339  case V4:
1340  memset(&ph4, 0, sizeof(ph4));
1341  ph4.vihl = IP_VER4;
1342  v6tov4(ph4.tcpsrc, dest);
1343  v6tov4(ph4.tcpdst, source);
1344  ph4.proto = IP_TCPPROTO;
1345  hnputs(ph4.tcplen, TCP4_HDRSIZE);
1346  hnputs(ph4.tcpsport, seg->dest);
1347  hnputs(ph4.tcpdport, seg->source);
1348  break;
1349  case V6:
1350  memset(&ph6, 0, sizeof(ph6));
1351  ph6.vcf[0] = IP_VER6;
1352  ipmove(ph6.tcpsrc, dest);
1353  ipmove(ph6.tcpdst, source);
1354  ph6.proto = IP_TCPPROTO;
1355  hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1356  hnputs(ph6.tcpsport, seg->dest);
1357  hnputs(ph6.tcpdport, seg->source);
1358  break;
1359  default:
1360  panic("sndrst: version %d", version);
1361  }
1362 
1363  tpriv->stats[OutRsts]++;
1364  rflags = RST;
1365 
1366  /* convince the other end that this reset is in band */
1367  if(seg->flags & ACK) {
1368  seg->seq = seg->ack;
1369  seg->ack = 0;
1370  }
1371  else {
1372  rflags |= ACK;
1373  seg->ack = seg->seq;
1374  seg->seq = 0;
1375  if(seg->flags & SYN)
1376  seg->ack++;
1377  seg->ack += length;
1378  if(seg->flags & FIN)
1379  seg->ack++;
1380  }
1381  seg->flags = rflags;
1382  seg->wnd = 0;
1383  seg->urg = 0;
1384  seg->mss = 0;
1385  seg->ws = 0;
1386  switch(version) {
1387  case V4:
1388  hbp = htontcp4(seg, nil, &ph4, nil);
1389  if(hbp == nil)
1390  return;
1391  ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1392  break;
1393  case V6:
1394  hbp = htontcp6(seg, nil, &ph6, nil);
1395  if(hbp == nil)
1396  return;
1397  ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1398  break;
1399  default:
1400  panic("sndrst2: version %d", version);
1401  }
1402 }
1403 
1404 /*
1405  * send a reset to the remote side and close the conversation
1406  * called with s qlocked
1407  */
1408 static char*
1409 tcphangup(Conv *s)
1410 {
1411  Tcp seg;
1412  Tcpctl *tcb;
1413  Block *hbp;
1414 
1415  tcb = (Tcpctl*)s->ptcl;
1416  if(waserror())
1417  return commonerror();
1418  if(ipcmp(s->raddr, IPnoaddr) != 0) {
1419  if(!waserror()){
1420  memset(&seg, 0, sizeof seg);
1421  seg.flags = RST | ACK;
1422  seg.ack = tcb->rcv.nxt;
1423  tcb->rcv.ackptr = seg.ack;
1424  seg.seq = tcb->snd.ptr;
1425  seg.wnd = 0;
1426  seg.urg = 0;
1427  seg.mss = 0;
1428  seg.ws = 0;
1429  switch(s->ipversion) {
1430  case V4:
1431  tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1432  hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1433  ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1434  break;
1435  case V6:
1436  tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1437  hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1438  ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1439  break;
1440  default:
1441  panic("tcphangup: version %d", s->ipversion);
1442  }
1443  poperror();
1444  }
1445  }
1446  localclose(s, nil);
1447  poperror();
1448  return nil;
1449 }
1450 
1451 /*
1452  * (re)send a SYN ACK
1453  */
1454 static int
1455 sndsynack(Proto *tcp, Limbo *lp)
1456 {
1457  Block *hbp;
1458  Tcp4hdr ph4;
1459  Tcp6hdr ph6;
1460  Tcp seg;
1461  uint scale;
1462 
1463  /* make pseudo header */
1464  switch(lp->version) {
1465  case V4:
1466  memset(&ph4, 0, sizeof(ph4));
1467  ph4.vihl = IP_VER4;
1468  v6tov4(ph4.tcpsrc, lp->laddr);
1469  v6tov4(ph4.tcpdst, lp->raddr);
1470  ph4.proto = IP_TCPPROTO;
1471  hnputs(ph4.tcplen, TCP4_HDRSIZE);
1472  hnputs(ph4.tcpsport, lp->lport);
1473  hnputs(ph4.tcpdport, lp->rport);
1474  break;
1475  case V6:
1476  memset(&ph6, 0, sizeof(ph6));
1477  ph6.vcf[0] = IP_VER6;
1478  ipmove(ph6.tcpsrc, lp->laddr);
1479  ipmove(ph6.tcpdst, lp->raddr);
1480  ph6.proto = IP_TCPPROTO;
1481  hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1482  hnputs(ph6.tcpsport, lp->lport);
1483  hnputs(ph6.tcpdport, lp->rport);
1484  break;
1485  default:
1486  panic("sndrst: version %d", lp->version);
1487  }
1488 
1489  memset(&seg, 0, sizeof seg);
1490  seg.seq = lp->iss;
1491  seg.ack = lp->irs+1;
1492  seg.flags = SYN|ACK;
1493  seg.urg = 0;
1494  seg.mss = tcpmtu(v6lookup(tcp->f, lp->raddr, lp->laddr, nil), lp->version, &scale);
1495  seg.wnd = QMAX;
1496 
1497  /* if the other side set scale, we should too */
1498  if(lp->rcvscale){
1499  seg.ws = scale;
1500  lp->sndscale = scale;
1501  } else {
1502  seg.ws = 0;
1503  lp->sndscale = 0;
1504  }
1505 
1506  switch(lp->version) {
1507  case V4:
1508  hbp = htontcp4(&seg, nil, &ph4, nil);
1509  if(hbp == nil)
1510  return -1;
1511  ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1512  break;
1513  case V6:
1514  hbp = htontcp6(&seg, nil, &ph6, nil);
1515  if(hbp == nil)
1516  return -1;
1517  ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1518  break;
1519  default:
1520  panic("sndsnack: version %d", lp->version);
1521  }
1522  lp->lastsend = NOW;
1523  return 0;
1524 }
1525 
/* limbo hash: last two address bytes plus the port, masked to the table size */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1527 
1528 /*
1529  * put a call into limbo and respond with a SYN ACK
1530  *
1531  * called with proto locked
1532  */
1533 static void
1534 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1535 {
1536  Limbo *lp, **l;
1537  Tcppriv *tpriv;
1538  int h;
1539 
1540  tpriv = s->p->priv;
1541  h = hashipa(source, seg->source);
1542 
1543  for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1544  lp = *l;
1545  if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1546  continue;
1547  if(ipcmp(lp->raddr, source) != 0)
1548  continue;
1549  if(ipcmp(lp->laddr, dest) != 0)
1550  continue;
1551 
1552  /* each new SYN restarts the retransmits */
1553  lp->irs = seg->seq;
1554  break;
1555  }
1556  lp = *l;
1557  if(lp == nil){
1558  if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1559  lp = tpriv->lht[h];
1560  tpriv->lht[h] = lp->next;
1561  lp->next = nil;
1562  } else {
1563  lp = malloc(sizeof(*lp));
1564  if(lp == nil)
1565  return;
1566  tpriv->nlimbo++;
1567  }
1568  *l = lp;
1569  lp->version = version;
1570  ipmove(lp->laddr, dest);
1571  ipmove(lp->raddr, source);
1572  lp->lport = seg->dest;
1573  lp->rport = seg->source;
1574  lp->mss = seg->mss;
1575  lp->rcvscale = seg->ws;
1576  lp->irs = seg->seq;
1577  lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1578  }
1579 
1580  if(sndsynack(s->p, lp) < 0){
1581  *l = lp->next;
1582  tpriv->nlimbo--;
1583  free(lp);
1584  }
1585 }
1586 
1587 /*
1588  * resend SYN ACK's once every SYNACK_RXTIMER ms.
1589  */
1590 static void
1591 limborexmit(Proto *tcp)
1592 {
1593  Tcppriv *tpriv;
1594  Limbo **l, *lp;
1595  int h;
1596  int seen;
1597  ulong now;
1598 
1599  tpriv = tcp->priv;
1600 
1601  if(!canqlock(tcp))
1602  return;
1603  seen = 0;
1604  now = NOW;
1605  for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1606  for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1607  lp = *l;
1608  seen++;
1609  if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1610  continue;
1611 
1612  /* time it out after 1 second */
1613  if(++(lp->rexmits) > 5){
1614  tpriv->nlimbo--;
1615  *l = lp->next;
1616  free(lp);
1617  continue;
1618  }
1619 
1620  /* if we're being attacked, don't bother resending SYN ACK's */
1621  if(tpriv->nlimbo > 100)
1622  continue;
1623 
1624  if(sndsynack(tcp, lp) < 0){
1625  tpriv->nlimbo--;
1626  *l = lp->next;
1627  free(lp);
1628  continue;
1629  }
1630 
1631  l = &lp->next;
1632  }
1633  }
1634  qunlock(tcp);
1635 }
1636 
1637 /*
1638  * lookup call in limbo. if found, throw it out.
1639  *
1640  * called with proto locked
1641  */
1642 static void
1643 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1644 {
1645  Limbo *lp, **l;
1646  int h;
1647  Tcppriv *tpriv;
1648 
1649  tpriv = s->p->priv;
1650 
1651  /* find a call in limbo */
1652  h = hashipa(src, segp->source);
1653  for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1654  lp = *l;
1655  if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1656  continue;
1657  if(ipcmp(lp->laddr, dst) != 0)
1658  continue;
1659  if(ipcmp(lp->raddr, src) != 0)
1660  continue;
1661 
1662  /* RST can only follow the SYN */
1663  if(segp->seq == lp->irs+1){
1664  tpriv->nlimbo--;
1665  *l = lp->next;
1666  free(lp);
1667  }
1668  break;
1669  }
1670 }
1671 
1672 static void
1673 initialwindow(Tcpctl *tcb)
1674 {
1675  /* RFC 3390 initial window */
1676  if(tcb->mss < 1095)
1677  tcb->cwind = 4*tcb->mss;
1678  else if(tcb->mss < 2190)
1679  tcb->cwind = 4380;
1680  else
1681  tcb->cwind = 2*tcb->mss;
1682 }
1683 
1684 /*
1685  * come here when we finally get an ACK to our SYN-ACK.
1686  * lookup call in limbo. if found, create a new conversation
1687  *
1688  * called with proto locked
1689  */
1690 static Conv*
1691 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1692 {
1693  Conv *new;
1694  Tcpctl *tcb;
1695  Tcppriv *tpriv;
1696  Tcp4hdr *h4;
1697  Tcp6hdr *h6;
1698  Limbo *lp, **l;
1699  int h;
1700 
1701  /* unless it's just an ack, it can't be someone coming out of limbo */
1702  if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1703  return nil;
1704 
1705  tpriv = s->p->priv;
1706 
1707  /* find a call in limbo */
1708  h = hashipa(src, segp->source);
1709  for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1710  netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1711  src, segp->source, lp->raddr, lp->rport,
1712  dst, segp->dest, lp->laddr, lp->lport,
1713  version, lp->version
1714  );
1715 
1716  if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1717  continue;
1718  if(ipcmp(lp->laddr, dst) != 0)
1719  continue;
1720  if(ipcmp(lp->raddr, src) != 0)
1721  continue;
1722 
1723  /* we're assuming no data with the initial SYN */
1724  if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1725  netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1726  segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1727  lp = nil;
1728  } else {
1729  tpriv->nlimbo--;
1730  *l = lp->next;
1731  }
1732  break;
1733  }
1734  if(lp == nil)
1735  return nil;
1736 
1737  new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1738  if(new == nil)
1739  return nil;
1740 
1741  memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1742  tcb = (Tcpctl*)new->ptcl;
1743  tcb->flags &= ~CLONE;
1744  tcb->timer.arg = new;
1745  tcb->timer.state = TcptimerOFF;
1746  tcb->acktimer.arg = new;
1747  tcb->acktimer.state = TcptimerOFF;
1748  tcb->katimer.arg = new;
1749  tcb->katimer.state = TcptimerOFF;
1750  tcb->rtt_timer.arg = new;
1751  tcb->rtt_timer.state = TcptimerOFF;
1752 
1753  tcb->irs = lp->irs;
1754  tcb->rcv.nxt = tcb->irs+1;
1755  tcb->rcv.wptr = tcb->rcv.nxt;
1756  tcb->rcv.wsnt = 0;
1757  tcb->rcv.urg = tcb->rcv.nxt;
1758 
1759  tcb->iss = lp->iss;
1760  tcb->rttseq = tcb->iss;
1761  tcb->snd.wl2 = tcb->iss;
1762  tcb->snd.una = tcb->iss+1;
1763  tcb->snd.ptr = tcb->iss+1;
1764  tcb->snd.nxt = tcb->iss+1;
1765  tcb->snd.rxt = tcb->iss+1;
1766  tcb->flgcnt = 0;
1767  tcb->flags |= SYNACK;
1768 
1769  /* set desired mss and scale */
1770  tcb->mss = tcpmtu(v6lookup(s->p->f, src, dst, s), version, &tcb->scale);
1771 
1772  /* our sending max segment size cannot be bigger than what he asked for */
1773  if(lp->mss != 0 && lp->mss < tcb->mss)
1774  tcb->mss = lp->mss;
1775  tpriv->stats[Mss] = tcb->mss;
1776 
1777  /* window scaling */
1778  tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1779 
1780  /* congestion window */
1781  tcb->snd.wnd = segp->wnd;
1782  initialwindow(tcb);
1783 
1784  /* set initial round trip time */
1785  tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1786  tcpsynackrtt(new);
1787 
1788  free(lp);
1789 
1790  /* set up proto header */
1791  switch(version){
1792  case V4:
1793  h4 = &tcb->protohdr.tcp4hdr;
1794  memset(h4, 0, sizeof(*h4));
1795  h4->proto = IP_TCPPROTO;
1796  hnputs(h4->tcpsport, new->lport);
1797  hnputs(h4->tcpdport, new->rport);
1798  v6tov4(h4->tcpsrc, dst);
1799  v6tov4(h4->tcpdst, src);
1800  break;
1801  case V6:
1802  h6 = &tcb->protohdr.tcp6hdr;
1803  memset(h6, 0, sizeof(*h6));
1804  h6->proto = IP_TCPPROTO;
1805  hnputs(h6->tcpsport, new->lport);
1806  hnputs(h6->tcpdport, new->rport);
1807  ipmove(h6->tcpsrc, dst);
1808  ipmove(h6->tcpdst, src);
1809  break;
1810  default:
1811  panic("tcpincoming: version %d", new->ipversion);
1812  }
1813 
1814  tcpsetstate(new, Established);
1815 
1816  iphtadd(&tpriv->ht, new);
1817 
1818  return new;
1819 }
1820 
/*
 * Return 1 if x lies in the closed interval [low, high], 0 otherwise.
 * When low > high the interval is taken to wrap around the end of the
 * sequence space.
 */
static int
seq_within(ulong x, ulong low, ulong high)
{
	if(low <= high)
		return low <= x && x <= high;

	/* wrapped interval: [low, max] plus [0, high] */
	return x >= low || x <= high;
}
1834 
/* sequence-space x < y: the sign of the difference narrowed to int */
static int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = x-y;
	return diff < 0;
}
1840 
/* sequence-space x <= y: the sign of the difference narrowed to int */
static int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = x-y;
	return diff <= 0;
}
1846 
/* sequence-space x > y: the sign of the difference narrowed to int */
static int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = x-y;
	return diff > 0;
}
1852 
/* sequence-space x >= y: the sign of the difference narrowed to int */
static int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = x-y;
	return diff >= 0;
}
1858 
1859 /*
1860  * use the time between the first SYN and it's ack as the
1861  * initial round trip time
1862  */
1863 static void
1864 tcpsynackrtt(Conv *s)
1865 {
1866  Tcpctl *tcb;
1867  int delta;
1868  Tcppriv *tpriv;
1869 
1870  tcb = (Tcpctl*)s->ptcl;
1871  tpriv = s->p->priv;
1872 
1873  delta = NOW - tcb->sndsyntime;
1874  tcb->srtt = delta<<LOGAGAIN;
1875  tcb->mdev = delta<<LOGDGAIN;
1876 
1877  /* halt round trip timer */
1878  tcphalt(tpriv, &tcb->rtt_timer);
1879 }
1880 
/*
 * Process the acknowledgement and window information carried by an
 * incoming segment for conversation s.
 *
 * In order: detect zero-window updates and duplicate acks (entering or
 * continuing NewReno recovery), update the send window, and, when the
 * ack advances snd.una, run congestion control, feed the round trip
 * estimator, discard the acknowledged bytes from the write queue and
 * restart or halt the retransmit timer.
 *
 * A segment is only processed once; seg->update records that it has
 * been handled, so repeated calls for the same segment are no-ops.
 */
static void
update(Conv *s, Tcp *seg)
{
	int rtt, delta;
	Tcpctl *tcb;
	ulong acked;
	Tcppriv *tpriv;

	if(seg->update)
		return;
	seg->update = 1;

	tpriv = s->p->priv;
	tcb = (Tcpctl*)s->ptcl;

	/* catch zero-window updates, update window & recover */
	if(tcb->snd.wnd == 0 && seg->wnd > 0)
	if(seq_lt(seg->ack, tcb->snd.ptr)){
		netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
			seg->ack, tcb->snd.una, tcb->snd.ptr, seg->wnd);
		tcb->snd.wnd = seg->wnd;
		/* jumps into the body of the dupack test below */
		goto recovery;
	}

	/* newreno fast retransmit */
	if(seg->ack == tcb->snd.una)
	if(tcb->snd.una != tcb->snd.nxt)
	if(++tcb->snd.dupacks == 3){
recovery:
		if(tcb->snd.recovery){
			/* already recovering: inflate the window by one segment */
			tpriv->stats[RecoveryCwind]++;
			tcb->cwind += tcb->mss;
		}else if(seq_le(tcb->snd.rxt, seg->ack)){
			/* enter recovery; snd.rxt marks where it will be complete */
			tpriv->stats[Recovery]++;
			tcb->abcbytes = 0;
			tcb->snd.recovery = 1;
			tcb->snd.partialack = 0;
			tcb->snd.rxt = tcb->snd.nxt;
			tcpcongestion(tcb);
			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
			tcprxmit(s);
		}else{
			tpriv->stats[RecoveryNoSeq]++;
			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
				tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
			/* do not enter fast retransmit */
			/* do not change ssthresh */
		}
	}else if(tcb->snd.recovery){
		/* a further duplicate ack while recovering */
		tpriv->stats[RecoveryCwind]++;
		tcb->cwind += tcb->mss;
	}

	/*
	 *  update window
	 */
	if(seq_gt(seg->ack, tcb->snd.wl2)
	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
		/* clear dupack if we advance wl2 */
		if(tcb->snd.wl2 != seg->ack)
			tcb->snd.dupacks = 0;
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	/* nothing new acknowledged: the rest only applies to fresh acks */
	if(!seq_gt(seg->ack, tcb->snd.una)){
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
			tcb->backedoff = MAXBACKMS/4;
		return;
	}

	/* Compute the new send window size */
	acked = seg->ack - tcb->snd.una;

	/* avoid slow start and timers for SYN acks */
	if((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		/* the SYN is counted in acked and flgcnt but is not queued data */
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/*
	 *  congestion control
	 */
	if(tcb->snd.recovery){
		if(seq_ge(seg->ack, tcb->snd.rxt)){
			/* recovery finished; deflate window */
			tpriv->stats[RecoveryDone]++;
			tcb->snd.dupacks = 0;
			tcb->snd.recovery = 0;
			tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
			if(tcb->ssthresh < tcb->cwind)
				tcb->cwind = tcb->ssthresh;
			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
				tcb->cwind, tcb->ssthresh);
		} else {
			/* partial ack; we lost more than one segment */
			tpriv->stats[RecoveryPA]++;
			if(tcb->cwind > acked)
				tcb->cwind -= acked;
			else{
				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
				tcb->cwind = tcb->mss;
			}
			netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
				acked, tcb->snd.rxt - seg->ack, tcb->cwind);

			if(acked >= tcb->mss)
				tcb->cwind += tcb->mss;
			tcb->snd.partialack++;
		}
	} else
		tcpabcincr(tcb, acked);

	/* Adjust the timers according to the round trip time */
	/* todo: fix sloppy treatment of overflow cases here. */
	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
		tcphalt(tpriv, &tcb->rtt_timer);
		/* samples taken while retransmitting are not used */
		if((tcb->flags&RETRAN) == 0) {
			tcb->backoff = 0;
			tcb->backedoff = 0;
			/* elapsed timer ticks, converted to ms below */
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if(rtt == 0)
				rtt = 1;	/* otherwise all close systems will rexmit in 0 time */
			rtt *= MSPTICK;
			if(tcb->srtt == 0) {
				/* first sample seeds both estimators */
				tcb->srtt = rtt << LOGAGAIN;
				tcb->mdev = rtt << LOGDGAIN;
			} else {
				/* srtt and mdev are stored scaled by LOGAGAIN/LOGDGAIN bits */
				delta = rtt - (tcb->srtt>>LOGAGAIN);
				tcb->srtt += delta;
				if(tcb->srtt <= 0)
					tcb->srtt = 1;

				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
				tcb->mdev += delta;
				if(tcb->mdev <= 0)
					tcb->mdev = 1;
			}
			tcpsettimer(tcb);
		}
	}

done:
	/* fewer bytes queued than acked: the ack also covered a flag */
	if(qdiscard(s->wq, acked) < acked)
		tcb->flgcnt--;
	tcb->snd.una = seg->ack;

	/* newreno fast recovery */
	if(tcb->snd.recovery)
		tcprxmit(s);

	if(seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* data still outstanding: keep the retransmit timer going, else stop it */
	if(tcb->snd.una != tcb->snd.nxt){
		/* “impatient” variant */
		if(!tcb->snd.recovery || tcb->snd.partialack == 1){
			tcb->time = NOW;
			tcb->timeuna = tcb->snd.una;
			tcpgo(tpriv, &tcb->timer);
		}
	}
	else
		tcphalt(tpriv, &tcb->timer);

	/* never leave the send pointer behind acknowledged data */
	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
		tcb->snd.ptr = tcb->snd.una;

	if(!tcb->snd.recovery)
		tcb->flags &= ~RETRAN;
	tcb->backoff = 0;
	tcb->backedoff = 0;
}
2062 
/*
 * Input routine for TCP segments arriving in bp.
 *
 * Verifies the checksum, converts the header with ntohtcp4/ntohtcp6 and
 * trims the block to the length claimed by the datagram.  Then, with
 * the protocol locked, looks up the conversation; segments for a
 * listener are handled through the limbo table (limborst, limbo,
 * tcpincoming).  The rest of the state machine runs with the
 * conversation locked and ends either at "output" (tcpoutput is called)
 * or at "raise" (the block is freed and tcpkick is called).
 *
 * bp is consumed on every path.
 */
static void
tcpiput(Proto *tcp, Ipifc*, Block *bp)
{
	Tcp seg;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int hdrlen;
	Tcpctl *tcb;
	ushort length, csum;
	uchar source[IPaddrlen], dest[IPaddrlen];
	Conv *s;
	Fs *f;
	Tcppriv *tpriv;
	uchar version;

	f = tcp->f;
	tpriv = tcp->priv;

	tpriv->stats[InSegs]++;

	/* both views of the same bytes; the version nibble decides which applies */
	h4 = (Tcp4hdr*)(bp->rp);
	h6 = (Tcp6hdr*)(bp->rp);

	if((h4->vihl&0xF0)==IP_VER4) {
		version = V4;
		length = nhgets(h4->length);
		v4tov6(dest, h4->tcpdst);
		v4tov6(source, h4->tcpsrc);

		/* fill in the pseudo header fields used by the checksum */
		h4->Unused = 0;
		hnputs(h4->tcplen, length-TCP4_PKT);
		/* checked only if Btcpck is not set on the block and a checksum is present */
		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}

		/* bp is not freed here on failure: ntohtcp4 has already disposed of it */
		hdrlen = ntohtcp4(&seg, &bp);
		if(hdrlen < 0){
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen+TCP4_PKT;
		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
		if(bp == nil){
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	}
	else {
		int ttl = h6->ttl;
		int proto = h6->proto;

		version = V6;
		length = nhgets(h6->ploadlen);
		ipmove(dest, h6->tcpdst);
		ipmove(source, h6->tcpsrc);

		/*
		 * temporarily rewrite the header into the form the checksum
		 * is computed over; ttl, proto and ploadlen are put back below
		 */
		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
		h6->ttl = proto;
		hnputl(h6->vcf, length);
		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp,
			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
			freeblist(bp);
			return;
		}
		h6->ttl = ttl;
		h6->proto = proto;
		hnputs(h6->ploadlen, length);

		/* as for v4, bp is not freed here when the header is bad */
		hdrlen = ntohtcp6(&seg, &bp);
		if(hdrlen < 0){
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen;
		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
		if(bp == nil){
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
			return;
		}
	}

	/* from here on, length is the number of data bytes in the segment */

	/* lock protocol while searching for a conversation */
	qlock(tcp);

	/* Look for a matching conversation */
	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
	if(s == nil){
		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
			source, seg.source, dest, seg.dest);
reset:
		/* also reached from the listener path when nothing in limbo matches */
		qunlock(tcp);
		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
		freeblist(bp);
		return;
	}

	/* if it's a listener, look for the right flags and get a new conv */
	tcb = (Tcpctl*)s->ptcl;
	if(tcb->state == Listen){
		if(seg.flags & RST){
			limborst(s, &seg, source, dest, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/* if this is a new SYN, put the call into limbo */
		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
			limbo(s, source, dest, &seg, version);
			qunlock(tcp);
			freeblist(bp);
			return;
		}

		/*
		 *  if there's a matching call in limbo, tcpincoming will
		 *  return a new conversation for it (tcpincoming has already
		 *  moved it to Established)
		 */
		s = tcpincoming(s, &seg, source, dest, version);
		if(s == nil)
			goto reset;
	}

	/* The rest of the input state machine is run with the control block
	 * locked and implements the state machine directly out of the RFC.
	 * Out-of-band data is ignored - it was always a bad idea.
	 */
	tcb = (Tcpctl*)s->ptcl;
	if(waserror()){
		qunlock(s);
		nexterror();
	}
	/* take the conversation lock before letting go of the protocol lock */
	qlock(s);
	qunlock(tcp);

	/* fix up window */
	seg.wnd <<= tcb->rcv.scale;

	/* every input packet in puts off the keep alive time out */
	tcpsetkacounter(tcb);

	switch(tcb->state) {
	case Closed:
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	case Syn_sent:
		/* an ack must cover our SYN and nothing we have not sent */
		if(seg.flags & ACK) {
			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
				sndrst(tcp, source, dest, length, &seg, version,
					 "bad seq in Syn_sent");
				goto raise;
			}
		}
		if(seg.flags & RST) {
			/* a reset only counts as a refusal when it acks our SYN */
			if(seg.flags & ACK)
				localclose(s, Econrefused);
			goto raise;
		}

		if(seg.flags & SYN) {
			procsyn(s, &seg);
			if(seg.flags & ACK){
				/* SYN+ACK: the call is up */
				update(s, &seg);
				tcpsynackrtt(s);
				tcpsetstate(s, Established);
				tcpsetscale(s, tcb, seg.ws, tcb->scale);
			}
			else {
				/* SYN without ACK */
				tcb->time = NOW;
				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
			}

			/* data or FIN came with the SYN: carry on with normal processing */
			if(length != 0 || (seg.flags & FIN))
				break;

			freeblist(bp);
			goto output;
		}
		else
			freeblist(bp);

		qunlock(s);
		poperror();
		return;
	case Syn_received:
		/* doesn't matter if it's the correct ack, we're just trying to set timing */
		if(seg.flags & ACK)
			tcpsynackrtt(s);
		break;
	}

	/*
	 *  One DOS attack is to open connections to us and then forget about them,
	 *  thereby tying up a conv at no long term cost to the attacker.
	 *  This is an attempt to defeat these stateless DOS attacks.  See
	 *  corresponding code in tcpsendka().
	 */
	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
		if(tcpporthogdefense
		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
				source, seg.source, dest, seg.dest, seg.flags,
				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
			localclose(s, "stateless hog");
		}
	}

	/* Cut the data to fit the receive window */
	tcprcvwin(s);
	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
		/* nothing of the segment is usable; still honour its ack */
		if(seg.seq+1 != tcb->rcv.nxt || length != 1)
			netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win %lud-%lud l %d from %I\n",
				seg.seq, seg.seq + length - 1,
				tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
		update(s, &seg);
		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
			tcphalt(tpriv, &tcb->rtt_timer);
			tcphalt(tpriv, &tcb->acktimer);
			tcphalt(tpriv, &tcb->katimer);
			tcpsetstate(s, Time_wait);
			tcb->timer.start = MSL2*(1000 / MSPTICK);
			tcpgo(tpriv, &tcb->timer);
		}
		/* answer anything but a reset with an ack */
		if(!(seg.flags & RST)) {
			tcb->flags |= FORCE;
			goto output;
		}
		qunlock(s);
		poperror();
		return;
	}

	/* Cannot accept so answer with a rst */
	if(length && tcb->state == Closed) {
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	}

	/* The segment is beyond the current receive pointer so
	 * queue the data in the resequence queue
	 */
	if(seg.seq != tcb->rcv.nxt)
	if(length != 0 || (seg.flags & (SYN|FIN))) {
		update(s, &seg);
		if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
		tcb->flags |= FORCE;	/* force duplicate ack; RFC 5681 §3.2 */
		goto output;
	}

	if(tcb->nreseq > 0)
		tcb->flags |= FORCE;	/* filled hole in sequence space; RFC 5681 §3.2 */

	/*
	 *  keep looping till we've processed this packet plus any
	 *  adjacent packets in the resequence queue
	 */
	for(;;) {
		if(seg.flags & RST) {
			if(tcb->state == Established) {
				tpriv->stats[EstabResets]++;
				if(tcb->rcv.nxt != seg.seq)
					print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
			}
			localclose(s, Econrefused);
			goto raise;
		}

		/* segments without ACK are dropped from here on */
		if((seg.flags&ACK) == 0)
			goto raise;

		switch(tcb->state) {
		case Syn_received:
			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
				sndrst(tcp, source, dest, length, &seg, version,
					"bad seq in Syn_received");
				goto raise;
			}
			update(s, &seg);
			tcpsetstate(s, Established);
			/* fall through; the second update is a no-op for this segment */
		case Established:
		case Close_wait:
			update(s, &seg);
			break;
		case Finwait1:
			update(s, &seg);
			/* everything we sent, FIN included, has been acked */
			if(qlen(s->wq)+tcb->flgcnt == 0){
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcpsetkacounter(tcb);
				tcb->time = NOW;
				tcpsetstate(s, Finwait2);
				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
				tcpgo(tpriv, &tcb->katimer);
			}
			break;
		case Finwait2:
			update(s, &seg);
			break;
		case Closing:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2*(1000 / MSPTICK);
				tcpgo(tpriv, &tcb->timer);
			}
			break;
		case Last_ack:
			update(s, &seg);
			if(qlen(s->wq)+tcb->flgcnt == 0) {
				localclose(s, nil);
				goto raise;
			}
			/* fall through */
		case Time_wait:
			if(seg.flags & FIN)
				tcb->flags |= FORCE;
			if(tcb->timer.state != TcptimerON)
				tcpgo(tpriv, &tcb->timer);
		}

		/* urgent data is not delivered: it is skipped over in bp */
		if((seg.flags&URG) && seg.urg) {
			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
				tcb->rcv.urg = seg.urg + seg.seq;
				pullblock(&bp, seg.urg);
			}
		}
		else
			if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
				tcb->rcv.urg = tcb->rcv.nxt;

		if(length == 0) {
			if(bp != nil)
				freeblist(bp);
		}
		else {
			switch(tcb->state){
			default:
				/* Ignore segment text */
				if(bp != nil)
					freeblist(bp);
				break;

			case Syn_received:
			case Established:
			case Finwait1:
				/* If we still have some data place on
				 * receive queue
				 */
				if(bp) {
					qpassnolim(s->rq, packblock(bp));
					bp = nil;
				}
				tcb->rcv.nxt += length;

				/*
				 *  turn on the acktimer if there's something
				 *  to ack
				 */
				if(tcb->acktimer.state != TcptimerON)
					tcpgo(tpriv, &tcb->acktimer);

				break;
			case Finwait2:
				/* no process to read the data, send a reset */
				if(bp != nil)
					freeblist(bp);
				sndrst(tcp, source, dest, length, &seg, version,
					"send to Finwait2");
				qunlock(s);
				poperror();
				return;
			}
		}

		if(seg.flags & FIN) {
			tcb->flags |= FORCE;

			/* the FIN takes one sequence number in the states that accept it */
			switch(tcb->state) {
			case Syn_received:
			case Established:
				tcb->rcv.nxt++;
				tcpsetstate(s, Close_wait);
				break;
			case Finwait1:
				tcb->rcv.nxt++;
				if(qlen(s->wq)+tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2*(1000/MSPTICK);
					tcpgo(tpriv, &tcb->timer);
				}
				else
					tcpsetstate(s, Closing);
				break;
			case Finwait2:
				tcb->rcv.nxt++;
				tcphalt(tpriv, &tcb->rtt_timer);
				tcphalt(tpriv, &tcb->acktimer);
				tcphalt(tpriv, &tcb->katimer);
				tcpsetstate(s, Time_wait);
				tcb->timer.start = MSL2 * (1000/MSPTICK);
				tcpgo(tpriv, &tcb->timer);
				break;
			case Close_wait:
			case Closing:
			case Last_ack:
				break;
			case Time_wait:
				tcpgo(tpriv, &tcb->timer);
				break;
			}
		}

		/*
		 *  get next adjacent segment from the resequence queue.
		 *  dump/trim any overlapping segments
		 */
		for(;;) {
			if(tcb->reseq == nil)
				goto output;

			/* next queued segment starts beyond rcv.nxt: still a hole */
			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
				goto output;

			getreseq(tcb, &seg, &bp, &length);

			tcprcvwin(s);
			/* usable after trimming: run it through the outer loop */
			if(tcptrim(tcb, &seg, &bp, &length) == 0){
				tcb->flags |= FORCE;
				break;
			}
		}
	}
output:
	tcpoutput(s);
	qunlock(s);
	poperror();
	return;
raise:
	qunlock(s);
	poperror();
	freeblist(bp);
	tcpkick(s);
}
2534 
2535 /*
2536  * always enters and exits with the s locked. We drop
2537  * the lock to ipoput the packet so some care has to be
2538  * taken by callers.
2539  */
2540 static void
2541 tcpoutput(Conv *s)
2542 {
2543  Tcp seg;
2544  uint msgs;
2545  Tcpctl *tcb;
2546  Block *hbp, *bp;
2547  int sndcnt;
2548  ulong ssize, dsize, sent;
2549  Fs *f;
2550  Tcppriv *tpriv;
2551  uchar version;
2552 
2553  f = s->p->f;
2554  tpriv = s->p->priv;
2555  version = s->ipversion;
2556 
2557  tcb = (Tcpctl*)s->ptcl;
2558 
2559  /* force ack every 2*mss */
2560  if((tcb->flags & FORCE) == 0)
2561  if(tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
2562  tpriv->stats[Delayack]++;
2563  tcb->flags |= FORCE;
2564  }
2565 
2566  /* force ack if window opening */
2567  if(0)
2568  if((tcb->flags & FORCE) == 0){
2569  tcprcvwin(s);
2570  if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
2571  tpriv->stats[Wopenack]++;
2572  tcb->flags |= FORCE;
2573  }
2574  }
2575 
2576  for(msgs = 0; msgs < 100; msgs++) {
2577  switch(tcb->state) {
2578  case Listen:
2579  case Closed:
2580  case Finwait2:
2581  return;
2582  }
2583 
2584  /* Don't send anything else until our SYN has been acked */
2585  if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2586  break;
2587 
2588  /* force an ack when a window has opened up */
2589  tcprcvwin(s);
2590  if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2591  tcb->rcv.blocked = 0;
2592  tcb->flags |= FORCE;
2593  }
2594 
2595  sndcnt = qlen(s->wq)+tcb->flgcnt;
2596  sent = tcb->snd.ptr - tcb->snd.una;
2597  ssize = sndcnt;
2598  if(tcb->snd.wnd == 0){
2599  /* zero window probe */
2600  if(sent > 0)
2601  if(!(tcb->flags & FORCE))
2602  break; /* already probing, rto re-probes */
2603  if(ssize < sent)
2604  ssize = 0;
2605  else{
2606  ssize -= sent;
2607  if(ssize > 0)
2608  ssize = 1;
2609  }
2610  } else {
2611  /* calculate usable segment size */
2612  if(ssize > tcb->cwind)
2613  ssize = tcb->cwind;
2614  if(ssize > tcb->snd.wnd)
2615  ssize = tcb->snd.wnd;
2616 
2617  if(ssize < sent)
2618  ssize = 0;
2619  else {
2620  ssize -= sent;
2621  if(ssize > tcb->mss)
2622  ssize = tcb->mss;
2623  }
2624  }
2625 
2626  dsize = ssize;
2627  seg.urg = 0;
2628 
2629  if(!(tcb->flags & FORCE)){
2630  if(ssize == 0)
2631  break;
2632  if(ssize < tcb->mss)
2633  if(tcb->snd.nxt == tcb->snd.ptr)
2634  if(sent > TCPREXMTTHRESH*tcb->mss)
2635  break;
2636  }
2637 
2638  tcb->flags &= ~FORCE;
2639 
2640  /* By default we will generate an ack */
2641  tcphalt(tpriv, &tcb->acktimer);
2642  seg.source = s->lport;
2643  seg.dest = s->rport;
2644  seg.flags = ACK;
2645  seg.mss = 0;
2646  seg.ws = 0;
2647  seg.update = 0;
2648  switch(tcb->state){
2649  case Syn_sent:
2650  seg.flags = 0;
2651  if(tcb->snd.ptr == tcb->iss){
2652  seg.flags |= SYN;
2653  dsize--;
2654  seg.mss = tcb->mss;
2655  seg.ws = tcb->scale;
2656  }
2657  break;
2658  case Syn_received:
2659  /*
2660  * don't send any data with a SYN/ACK packet
2661  * because Linux rejects the packet in its
2662  * attempt to solve the SYN attack problem
2663  */
2664  if(tcb->snd.ptr == tcb->iss){
2665  seg.flags |= SYN;
2666  dsize = 0;
2667  ssize = 1;
2668  seg.mss = tcb->mss;
2669  seg.ws = tcb->scale;
2670  }
2671  break;
2672  }
2673  seg.seq = tcb->snd.ptr;
2674  seg.ack = tcb->rcv.nxt;
2675  seg.wnd = tcb->rcv.wnd;
2676 
2677  /* Pull out data to send */
2678  bp = nil;
2679  if(dsize != 0) {
2680  bp = qcopy(s->wq, dsize, sent);
2681  if(BLEN(bp) != dsize) {
2682  seg.flags |= FIN;
2683  dsize--;
2684  }
2685  }
2686 
2687  if(sent+dsize == sndcnt && dsize)
2688  seg.flags |= PSH;
2689 
2690  tcb->snd.ptr += ssize;
2691 
2692  /* Pull up the send pointer so we can accept acks
2693  * for this window
2694  */
2695  if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2696  tcb->snd.nxt = tcb->snd.ptr;
2697 
2698  /* Build header, link data and compute cksum */
2699  switch(version){
2700  case V4:
2701  tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2702  hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2703  if(hbp == nil) {
2704  freeblist(bp);
2705  return;
2706  }
2707  break;
2708  case V6:
2709  tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2710  hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2711  if(hbp == nil) {
2712  freeblist(bp);
2713  return;
2714  }
2715  break;
2716  default:
2717  hbp = nil; /* to suppress a warning */
2718  panic("tcpoutput: version %d", version);
2719  }
2720 
2721  /* Start the transmission timers if there is new data and we
2722  * expect acknowledges
2723  */
2724  if(ssize != 0){
2725  if(tcb->timer.state != TcptimerON){
2726  tcb->time = NOW;
2727  tcb->timeuna = tcb->snd.una;
2728  tcpgo(tpriv, &tcb->timer);
2729  }
2730 
2731  /* If round trip timer isn't running, start it.
2732  * measure the longest packet only in case the
2733  * transmission time dominates RTT
2734  */
2735  if(tcb->snd.retransmit == 0)
2736  if(tcb->rtt_timer.state != TcptimerON)
2737  if(ssize == tcb->mss) {
2738  tcpgo(tpriv, &tcb->rtt_timer);
2739  tcb->rttseq = tcb->snd.ptr;
2740  }
2741  }
2742 
2743  tpriv->stats[OutSegs]++;
2744  if(tcb->snd.retransmit)
2745  tpriv->stats[RetransSegsSent]++;
2746  tcb->rcv.ackptr = seg.ack;
2747  tcb->rcv.wsnt = tcb->rcv.wptr;
2748 
2749  /* put off the next keep alive */
2750  tcpgo(tpriv, &tcb->katimer);
2751 
2752  switch(version){
2753  case V4:
2754  if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2755  /* a negative return means no route */
2756  localclose(s, "no route");
2757  }
2758  break;
2759  case V6:
2760  if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2761  /* a negative return means no route */
2762  localclose(s, "no route");
2763  }
2764  break;
2765  default:
2766  panic("tcpoutput2: version %d", version);
2767  }
2768  if((msgs%4) == 3){
2769  qunlock(s);
2770  qlock(s);
2771  }
2772  }
2773 }
2774 
2775 /*
2776  * the BSD convention (hack?) for keep alives. resend last uchar acked.
2777  */
2778 static void
2779 tcpsendka(Conv *s)
2780 {
2781  Tcp seg;
2782  Tcpctl *tcb;
2783  Block *hbp,*dbp;
2784 
2785  tcb = (Tcpctl*)s->ptcl;
2786 
2787  dbp = nil;
2788  memset(&seg, 0, sizeof seg);
2789  seg.urg = 0;
2790  seg.source = s->lport;
2791  seg.dest = s->rport;
2792  seg.flags = ACK|PSH;
2793  seg.mss = 0;
2794  seg.ws = 0;
2795  if(tcpporthogdefense)
2796  seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2797  else
2798  seg.seq = tcb->snd.una-1;
2799  seg.ack = tcb->rcv.nxt;
2800  tcb->rcv.ackptr = seg.ack;
2801  tcprcvwin(s);
2802  seg.wnd = tcb->rcv.wnd;
2803  if(tcb->state == Finwait2){
2804  seg.flags |= FIN;
2805  } else {
2806  dbp = allocb(1);
2807  dbp->wp++;
2808  }
2809 
2810  if(isv4(s->raddr)) {
2811  /* Build header, link data and compute cksum */
2812  tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2813  hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2814  if(hbp == nil) {
2815  freeblist(dbp);
2816  return;
2817  }
2818  ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2819  }
2820  else {
2821  /* Build header, link data and compute cksum */
2822  tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2823  hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2824  if(hbp == nil) {
2825  freeblist(dbp);
2826  return;
2827  }
2828  ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2829  }
2830 }
2831 
2832 /*
2833  * set connection to time out after 12 minutes
2834  */
2835 static void
2836 tcpsetkacounter(Tcpctl *tcb)
2837 {
2838  tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2839  if(tcb->kacounter < 3)
2840  tcb->kacounter = 3;
2841 }
2842 
2843 /*
2844  * if we've timed out, close the connection
2845  * otherwise, send a keepalive and restart the timer
2846  */
2847 static void
2848 tcpkeepalive(void *v)
2849 {
2850  Tcpctl *tcb;
2851  Conv *s;
2852 
2853  s = v;
2854  tcb = (Tcpctl*)s->ptcl;
2855  if(waserror()){
2856  qunlock(s);
2857  nexterror();
2858  }
2859  qlock(s);
2860  if(tcb->state != Closed){
2861  if(--(tcb->kacounter) <= 0) {
2862  localclose(s, Etimedout);
2863  } else {
2864  tcpsendka(s);
2865  tcpgo(s->p->priv, &tcb->katimer);
2866  }
2867  }
2868  qunlock(s);
2869  poperror();
2870 }
2871 
2872 /*
2873  * start keepalive timer
2874  */
2875 static char*
2876 tcpstartka(Conv *s, char **f, int n)
2877 {
2878  Tcpctl *tcb;
2879  int x;
2880 
2881  tcb = (Tcpctl*)s->ptcl;
2882  if(tcb->state != Established)
2883  return "connection must be in Establised state";
2884  if(n > 1){
2885  x = atoi(f[1]);
2886  if(x >= MSPTICK)
2887  tcb->katimer.start = x/MSPTICK;
2888  }
2889  tcpsetkacounter(tcb);
2890  tcpgo(s->p->priv, &tcb->katimer);
2891 
2892  return nil;
2893 }
2894 
2895 /*
2896  * turn checksums on/off
2897  */
2898 static char*
2899 tcpsetchecksum(Conv *s, char **f, int)
2900 {
2901  Tcpctl *tcb;
2902 
2903  tcb = (Tcpctl*)s->ptcl;
2904  tcb->nochecksum = !atoi(f[1]);
2905 
2906  return nil;
2907 }
2908 
2909 /*
2910  * retransmit (at most) one segment at snd.una.
2911  * preserve cwind & snd.ptr
2912  */
2913 static void
2914 tcprxmit(Conv *s)
2915 {
2916  Tcpctl *tcb;
2917  Tcppriv *tpriv;
2918  ulong tcwind, tptr;
2919 
2920  tcb = (Tcpctl*)s->ptcl;
2921  tcb->flags |= RETRAN|FORCE;
2922 
2923  tptr = tcb->snd.ptr;
2924  tcwind = tcb->cwind;
2925  tcb->snd.ptr = tcb->snd.una;
2926  tcb->cwind = tcb->mss;
2927  tcb->snd.retransmit = 1;
2928  tcpoutput(s);
2929  tcb->snd.retransmit = 0;
2930  tcb->cwind = tcwind;
2931  tcb->snd.ptr = tptr;
2932 
2933  tpriv = s->p->priv;
2934  tpriv->stats[RetransSegs]++;
2935 }
2936 
2937 /*
2938  * todo: RFC 4138 F-RTO
2939  */
2940 static void
2941 tcptimeout(void *arg)
2942 {
2943  Conv *s;
2944  Tcpctl *tcb;
2945  int maxback;
2946  Tcppriv *tpriv;
2947 
2948  s = (Conv*)arg;
2949  tpriv = s->p->priv;
2950  tcb = (Tcpctl*)s->ptcl;
2951 
2952  if(waserror()){
2953  qunlock(s);
2954  nexterror();
2955  }
2956  qlock(s);
2957  switch(tcb->state){
2958  default:
2959  tcb->backoff++;
2960  if(tcb->state == Syn_sent)
2961  maxback = MAXBACKMS/2;
2962  else
2963  maxback = MAXBACKMS;
2964  tcb->backedoff += tcb->timer.start * MSPTICK;
2965  if(tcb->backedoff >= maxback) {
2966  localclose(s, Etimedout);
2967  break;
2968  }
2969  netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
2970  tcb->srtt, tcb->mdev, NOW-tcb->time,
2971  tcb->snd.una-tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
2972  tcpstates[s->state]);
2973  tcpsettimer(tcb);
2974  if(tcb->snd.rto == 0)
2975  tcpcongestion(tcb);
2976  tcprxmit(s);
2977  tcb->snd.ptr = tcb->snd.una;
2978  tcb->cwind = tcb->mss;
2979  tcb->snd.rto = 1;
2980  tpriv->stats[RetransTimeouts]++;
2981 
2982  if(tcb->snd.recovery){
2983  tcb->snd.dupacks = 0; /* reno rto */
2984  tcb->snd.recovery = 0;
2985  tpriv->stats[RecoveryRTO]++;
2986  tcb->snd.rxt = tcb->snd.nxt;
2987  netlog(s->p->f, Logtcpwin,
2988  "rto recovery rxt @%lud\n", tcb->snd.nxt);
2989  }
2990 
2991  tcb->abcbytes = 0;
2992  break;
2993  case Time_wait:
2994  localclose(s, nil);
2995  break;
2996  case Closed:
2997  break;
2998  }
2999  qunlock(s);
3000  poperror();
3001 }
3002 
3003 static int
3004 inwindow(Tcpctl *tcb, int seq)
3005 {
3006  return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3007 }
3008 
3009 /*
3010  * set up state for a received SYN (or SYN ACK) packet
3011  */
3012 static void
3013 procsyn(Conv *s, Tcp *seg)
3014 {
3015  Tcpctl *tcb;
3016  Tcppriv *tpriv;
3017 
3018  tcb = (Tcpctl*)s->ptcl;
3019  tcb->flags |= FORCE;
3020 
3021  tcb->rcv.nxt = seg->seq + 1;
3022  tcb->rcv.wptr = tcb->rcv.nxt;
3023  tcb->rcv.wsnt = 0;
3024  tcb->rcv.urg = tcb->rcv.nxt;
3025  tcb->irs = seg->seq;
3026 
3027  /* our sending max segment size cannot be bigger than what he asked for */
3028  if(seg->mss != 0 && seg->mss < tcb->mss) {
3029  tcb->mss = seg->mss;
3030  tpriv = s->p->priv;
3031  tpriv->stats[Mss] = tcb->mss;
3032  }
3033 
3034  /* if the server does not support ws option, disable window scaling */
3035  if(seg->ws == 0){
3036  tcb->scale = 0;
3037  tcb->snd.scale = 0;
3038  }
3039 
3040  tcb->snd.wnd = seg->wnd;
3041  initialwindow(tcb);
3042 }
3043 
3044 static int
3045 dumpreseq(Tcpctl *tcb)
3046 {
3047  Reseq *r, *next;
3048 
3049  for(r = tcb->reseq; r != nil; r = next){
3050  next = r->next;
3051  freeblist(r->bp);
3052  free(r);
3053  }
3054  tcb->reseq = nil;
3055  tcb->nreseq = 0;
3056  tcb->reseqlen = 0;
3057  return -1;
3058 }
3059 
3060 static void
3061 logreseq(Fs *f, Reseq *r, ulong n)
3062 {
3063  char *s;
3064 
3065  for(; r != nil; r = r->next){
3066  s = nil;
3067  if(r->next == nil && r->seg.seq != n)
3068  s = "hole/end";
3069  else if(r->next == nil)
3070  s = "end";
3071  else if(r->seg.seq != n)
3072  s = "hole";
3073  if(s != nil)
3074  netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3075  n, r->seg.seq, r->seg.seq-n, r->seg.flags);
3076  n = r->seg.seq + r->seg.len;
3077  }
3078 }
3079 
3080 static int
3081 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3082 {
3083  Reseq *rp, **rr;
3084  int qmax;
3085 
3086  rp = malloc(sizeof(Reseq));
3087  if(rp == nil){
3088  freeblist(bp); /* bp always consumed by addreseq */
3089  return 0;
3090  }
3091 
3092  rp->seg = *seg;
3093  rp->bp = bp;
3094  rp->length = length;
3095 
3096  tcb->reseqlen += length;
3097  tcb->nreseq++;
3098 
3099  /* Place on reassembly list sorting by starting seq number */
3100  for(rr = &tcb->reseq;; rr = &(*rr)->next)
3101  if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3102  rp->next = *rr;
3103  *rr = rp;
3104  tpriv->stats[Resequenced]++;
3105  if(rp->next != nil)
3106  tpriv->stats[OutOfOrder]++;
3107  break;
3108  }
3109 
3110  qmax = tcb->window;
3111  if(tcb->reseqlen > qmax){
3112  netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", tcb->reseqlen, qmax, tcb->nreseq);
3113  logreseq(f, tcb->reseq, tcb->rcv.nxt);
3114  tpriv->stats[ReseqBytelim]++;
3115  return dumpreseq(tcb);
3116  }
3117  qmax = tcb->window / tcb->mss; /* ~190 for qscale==2, 390 for qscale=3 */
3118  if(tcb->nreseq > qmax){
3119  netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", tcb->nreseq, qmax, tcb->reseqlen);
3120  logreseq(f, tcb->reseq, tcb->rcv.nxt);
3121  tpriv->stats[ReseqPktlim]++;
3122  return dumpreseq(tcb);
3123  }
3124 
3125  return 0;
3126 }
3127 
3128 static void
3129 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3130 {
3131  Reseq *rp;
3132 
3133  rp = tcb->reseq;
3134  if(rp == nil)
3135  return;
3136 
3137  tcb->reseq = rp->next;
3138 
3139  *seg = rp->seg;
3140  *bp = rp->bp;
3141  *length = rp->length;
3142 
3143  tcb->nreseq--;
3144  tcb->reseqlen -= rp->length;
3145 
3146  free(rp);
3147 }
3148 
3149 static int
3150 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3151 {
3152  ushort len;
3153  uchar accept;
3154  int dupcnt, excess;
3155 
3156  accept = 0;
3157  len = *length;
3158  if(seg->flags & SYN)
3159  len++;
3160  if(seg->flags & FIN)
3161  len++;
3162 
3163  if(tcb->rcv.wnd == 0) {
3164  if(len == 0 && seg->seq == tcb->rcv.nxt)
3165  return 0;
3166  }
3167  else {
3168  /* Some part of the segment should be in the window */
3169  if(inwindow(tcb,seg->seq))
3170  accept++;
3171  else
3172  if(len != 0) {
3173  if(inwindow(tcb, seg->seq+len-1) ||
3174  seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
3175  accept++;
3176  }
3177  }
3178  if(!accept) {
3179  freeblist(*bp);
3180  return -1;
3181  }
3182  dupcnt = tcb->rcv.nxt - seg->seq;
3183  if(dupcnt > 0){
3184  tcb->rerecv += dupcnt;
3185  if(seg->flags & SYN){
3186  seg->flags &= ~SYN;
3187  seg->seq++;
3188 
3189  if(seg->urg > 1)
3190  seg->urg--;
3191  else
3192  seg->flags &= ~URG;
3193  dupcnt--;
3194  }
3195  if(dupcnt > 0){
3196  pullblock(bp, (ushort)dupcnt);
3197  seg->seq += dupcnt;
3198  *length -= dupcnt;
3199 
3200  if(seg->urg > dupcnt)
3201  seg->urg -= dupcnt;
3202  else {
3203  seg->flags &= ~URG;
3204  seg->urg = 0;
3205  }
3206  }
3207  }
3208  excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3209  if(excess > 0) {
3210  tcb->rerecv += excess;
3211  *length -= excess;
3212  *bp = trimblock(*bp, 0, *length);
3213  if(*bp == nil)
3214  panic("presotto is a boofhead");
3215  seg->flags &= ~FIN;
3216  }
3217  return 0;
3218 }
3219 
3220 static void
3221 tcpadvise(Proto *tcp, Block *bp, char *msg)
3222 {
3223  Tcp4hdr *h4;
3224  Tcp6hdr *h6;
3225  Tcpctl *tcb;
3226  uchar source[IPaddrlen];
3227  uchar dest[IPaddrlen];
3228  ushort psource, pdest;
3229  Conv *s, **p;
3230 
3231  h4 = (Tcp4hdr*)(bp->rp);
3232  h6 = (Tcp6hdr*)(bp->rp);
3233 
3234  if((h4->vihl&0xF0)==IP_VER4) {
3235  v4tov6(dest, h4->tcpdst);
3236  v4tov6(source, h4->tcpsrc);
3237  psource = nhgets(h4->tcpsport);
3238  pdest = nhgets(h4->tcpdport);
3239  } else {
3240  ipmove(dest, h6->tcpdst);
3241  ipmove(source, h6->tcpsrc);
3242  psource = nhgets(h6->tcpsport);
3243  pdest = nhgets(h6->tcpdport);
3244  }
3245 
3246  /* Look for a connection */
3247  qlock(tcp);
3248  for(p = tcp->conv; (s = *p) != nil; p++) {
3249  tcb = (Tcpctl*)s->ptcl;
3250  if(s->rport == pdest)
3251  if(s->lport == psource)
3252  if(tcb->state != Closed)
3253  if(ipcmp(s->raddr, dest) == 0)
3254  if(ipcmp(s->laddr, source) == 0){
3255  if(s->ignoreadvice)
3256  break;
3257  qlock(s);
3258  qunlock(tcp);
3259  switch(tcb->state){
3260  case Syn_sent:
3261  localclose(s, msg);
3262  break;
3263  }
3264  qunlock(s);
3265  freeblist(bp);
3266  return;
3267  }
3268  }
3269  qunlock(tcp);
3270  freeblist(bp);
3271 }
3272 
3273 static char*
3274 tcpporthogdefensectl(char *val)
3275 {
3276  if(strcmp(val, "on") == 0)
3277  tcpporthogdefense = 1;
3278  else if(strcmp(val, "off") == 0)
3279  tcpporthogdefense = 0;
3280  else
3281  return "unknown value for tcpporthogdefense";
3282  return nil;
3283 }
3284 
3285 /* called with c qlocked */
3286 static char*
3287 tcpctl(Conv* c, char** f, int n)
3288 {
3289  if(n == 1 && strcmp(f[0], "close") == 0)
3290  return tcpclose(c), nil;
3291  if(n == 1 && strcmp(f[0], "hangup") == 0)
3292  return tcphangup(c);
3293  if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3294  return tcpstartka(c, f, n);
3295  if(n >= 1 && strcmp(f[0], "checksum") == 0)
3296  return tcpsetchecksum(c, f, n);
3297  if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3298  return tcpporthogdefensectl(f[1]);
3299  return "unknown control request";
3300 }
3301 
3302 static int
3303 tcpstats(Proto *tcp, char *buf, int len)
3304 {
3305  Tcppriv *priv;
3306  char *p, *e;
3307  int i;
3308 
3309  priv = tcp->priv;
3310  p = buf;
3311  e = p+len;
3312  for(i = 0; i < Nstats; i++)
3313  p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3314  return p - buf;
3315 }
3316 
3317 /*
3318  * garbage collect any stale conversations:
3319  * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3320  * - Finwait2 after 5 minutes
3321  *
3322  * this is called whenever we run out of channels. Both checks are
3323  * of questionable validity so we try to use them only when we're
3324  * up against the wall.
3325  */
3326 static int
3327 tcpgc(Proto *tcp)
3328 {
3329  Conv *c, **pp, **ep;
3330  int n;
3331  Tcpctl *tcb;
3332 
3333 
3334  n = 0;
3335  ep = &tcp->conv[tcp->nc];
3336  for(pp = tcp->conv; pp < ep; pp++) {
3337  c = *pp;
3338  if(c == nil)
3339  break;
3340  if(!canqlock(c))
3341  continue;
3342  tcb = (Tcpctl*)c->ptcl;
3343  switch(tcb->state){
3344  case Syn_received:
3345  if(NOW - tcb->time > 5000){
3346  localclose(c, Etimedout);
3347  n++;
3348  }
3349  break;
3350  case Finwait2:
3351  if(NOW - tcb->time > 5*60*1000){
3352  localclose(c, Etimedout);
3353  n++;
3354  }
3355  break;
3356  }
3357  qunlock(c);
3358  }
3359  return n;
3360 }
3361 
3362 static void
3363 tcpsettimer(Tcpctl *tcb)
3364 {
3365  int x;
3366 
3367  /* round trip dependency */
3368  x = backoff(tcb->backoff) *
3369  (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3370 
3371  /* bounded twixt 0.3 and 64 seconds */
3372  if(x < 300/MSPTICK)
3373  x = 300/MSPTICK;
3374  else if(x > (64000/MSPTICK))
3375  x = 64000/MSPTICK;
3376  tcb->timer.start = x;
3377 }
3378 
3379 void
3380 tcpinit(Fs *fs)
3381 {
3382  Proto *tcp;
3383  Tcppriv *tpriv;
3384 
3385  tcp = smalloc(sizeof(Proto));
3386  tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3387  tcp->name = "tcp";
3388  tcp->connect = tcpconnect;
3389  tcp->announce = tcpannounce;
3390  tcp->ctl = tcpctl;
3391  tcp->state = tcpstate;
3392  tcp->create = tcpcreate;
3393  tcp->close = tcpclose;
3394  tcp->rcv = tcpiput;
3395  tcp->advise = tcpadvise;
3396  tcp->stats = tcpstats;
3397  tcp->inuse = tcpinuse;
3398  tcp->gc = tcpgc;
3399  tcp->ipproto = IP_TCPPROTO;
3400  tcp->nc = scalednconv();
3401  tcp->ptclsize = sizeof(Tcpctl);
3402  tpriv->stats[MaxConn] = tcp->nc;
3403 
3404  Fsproto(fs, tcp);
3405 }
3406 
3407 static void
3408 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3409 {
3410  /*
3411  * guess at reasonable queue sizes. there's no current way
3412  * to know how many nic receive buffers we can safely tie up in the
3413  * tcp stack, and we don't adjust our queues to maximize throughput
3414  * and minimize bufferbloat. n.b. the offer (rcvscale) needs to be
3415  * respected, but we still control our own buffer commitment by
3416  * keeping a seperate qscale.
3417  */
3418  tcb->rcv.scale = rcvscale & 0xff;
3419  tcb->snd.scale = sndscale & 0xff;
3420  tcb->qscale = rcvscale & 0xff;
3421  if(rcvscale > Maxqscale)
3422  tcb->qscale = Maxqscale;
3423 
3424  if(rcvscale != tcb->rcv.scale)
3425  netlog(s->p->f, Logtcp, "tcpsetscale: window %lud qlen %d >> window %ud lport %d\n",
3426  tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3427  tcb->window = QMAX<<tcb->qscale;
3428  tcb->ssthresh = tcb->window;
3429 
3430  /*
3431  * it's important to set wq large enough to cover the full
3432  * bandwidth-delay product. it's possible to be in loss
3433  * recovery with a big window, and we need to keep sending
3434  * into the inflated window. the difference can be huge
3435  * for even modest (70ms) ping times.
3436  */
3437  qsetlimit(s->rq, QMAX<<tcb->qscale);
3438  qsetlimit(s->wq, QMAX<<tcb->qscale);
3439  tcprcvwin(s);
3440 }