1: /*
2: * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3: * The Regents of the University of California. All rights reserved.
4: *
5: * Redistribution and use in source and binary forms, with or without
6: * modification, are permitted provided that the following conditions
7: * are met:
8: * 1. Redistributions of source code must retain the above copyright
9: * notice, this list of conditions and the following disclaimer.
10: * 2. Redistributions in binary form must reproduce the above copyright
11: * notice, this list of conditions and the following disclaimer in the
12: * documentation and/or other materials provided with the distribution.
13: * 3. All advertising materials mentioning features or use of this software
14: * must display the following acknowledgement:
15: * This product includes software developed by the University of
16: * California, Berkeley and its contributors.
17: * 4. Neither the name of the University nor the names of its contributors
18: * may be used to endorse or promote products derived from this software
19: * without specific prior written permission.
20: *
21: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31: * SUCH DAMAGE.
32: *
33: * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
34: * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.31 2003/01/24 05:11:34 sam Exp $
35: * $DragonFly: src/sys/netinet/tcp_subr.c,v 1.22 2004/04/10 00:07:15 hsu Exp $
36: */
37:
38: #include "opt_compat.h"
39: #include "opt_inet6.h"
40: #include "opt_ipsec.h"
41: #include "opt_tcpdebug.h"
42: #include "opt_tcp_input.h"
43:
44: #include <sys/param.h>
45: #include <sys/systm.h>
46: #include <sys/callout.h>
47: #include <sys/kernel.h>
48: #include <sys/sysctl.h>
49: #include <sys/malloc.h>
50: #include <sys/mbuf.h>
51: #ifdef INET6
52: #include <sys/domain.h>
53: #endif
54: #include <sys/proc.h>
55: #include <sys/socket.h>
56: #include <sys/socketvar.h>
57: #include <sys/protosw.h>
58: #include <sys/random.h>
59: #include <sys/in_cksum.h>
60:
61: #include <vm/vm_zone.h>
62:
63: #include <net/route.h>
64: #include <net/if.h>
65: #include <net/netisr.h>
66:
67: #define _IP_VHL
68: #include <netinet/in.h>
69: #include <netinet/in_systm.h>
70: #include <netinet/ip.h>
71: #ifdef INET6
72: #include <netinet/ip6.h>
73: #endif
74: #include <netinet/in_pcb.h>
75: #ifdef INET6
76: #include <netinet6/in6_pcb.h>
77: #endif
78: #include <netinet/in_var.h>
79: #include <netinet/ip_var.h>
80: #ifdef INET6
81: #include <netinet6/ip6_var.h>
82: #endif
83: #include <netinet/tcp.h>
84: #include <netinet/tcp_fsm.h>
85: #include <netinet/tcp_seq.h>
86: #include <netinet/tcp_timer.h>
87: #include <netinet/tcp_var.h>
88: #ifdef INET6
89: #include <netinet6/tcp6_var.h>
90: #endif
91: #include <netinet/tcpip.h>
92: #ifdef TCPDEBUG
93: #include <netinet/tcp_debug.h>
94: #endif
95: #include <netinet6/ip6protosw.h>
96:
97: #ifdef IPSEC
98: #include <netinet6/ipsec.h>
99: #ifdef INET6
100: #include <netinet6/ipsec6.h>
101: #endif
102: #endif /*IPSEC*/
103:
104: #ifdef FAST_IPSEC
105: #include <netipsec/ipsec.h>
106: #ifdef INET6
107: #include <netipsec/ipsec6.h>
108: #endif
109: #define IPSEC
110: #endif /*FAST_IPSEC*/
111:
112: #include <sys/md5.h>
113:
114: #include <sys/msgport2.h>
115:
/*
 * Global TCP tunables and the sysctl nodes that export them under
 * net.inet.tcp.  Variables declared static are private to this file.
 */

/* Default maximum segment size; starts out as TCP_MSS. */
int tcp_mssdflt = TCP_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");

#ifdef INET6
/* Default maximum segment size for IPv6; starts out as TCP6_MSS. */
int tcp_v6mssdflt = TCP6_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
    CTLFLAG_RW, &tcp_v6mssdflt , 0,
    "Default TCP Maximum Segment Size for IPv6");
#endif

/* Disabled: the default round trip time sysctl is compiled out. */
#if 0
static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
    &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
#endif

/* Enabled by default; tcp_newtcpcb() consults this when setting t_flags. */
int tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");

/* Disabled by default; tcp_newtcpcb() consults this when setting t_flags. */
int tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");

/* Read-only; filled in by tcp_init() with the hash size actually used. */
static int tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
    &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

/* When zero, tcp_drain() returns without doing anything. */
static int do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
    "Enable tcp_drain routine for extra help when low on mbufs");

/* XXX JH: only the count kept in tcbinfo[0] is exported here. */
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
    &tcbinfo[0].ipi_count, 0, "Number of active PCBs");

/* Not referenced in the visible part of this file. */
static int icmp_may_rst = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");

/* Defaults to 0; not referenced in the visible part of this file. */
static int tcp_isn_reseed_interval = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");

/*
 * TCP bandwidth limiting sysctls.
 *
 * NOTE(review): this comment used to state that the default lower bound
 * was 1024 (kept only for debugging) and suggested roughly 6100 as a
 * production value.  The default compiled in below is 6144.
 */
static int tcp_inflight_enable = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");

static int tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");

static int tcp_inflight_min = 6144;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");

/* Same initial value that tcp_newtcpcb() gives snd_cwnd and snd_bwnd. */
static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");

/* Expressed in tenths of a packet, as the description string says. */
static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
    &tcp_inflight_stab, 0, "Slop in maximal packets / 10 (20 = 2 packets)");

/* Forward declarations of file-local helpers. */
static void tcp_cleartaocache (void);
static void tcp_notify (struct inpcb *, int);
189: struct tcp_stats tcpstats_ary[MAXCPU];
190: #ifdef SMP
191: static int
192: sysctl_tcpstats(SYSCTL_HANDLER_ARGS)
193: {
194: int cpu, error;
195:
196: for (cpu = error = 0; cpu < ncpus; ++cpu) {
197: if ((error = SYSCTL_OUT(req, (void *)&tcpstats_ary[cpu],
198: sizeof(struct tcp_stats))))
199: break;
200: if ((error = SYSCTL_IN(req, (void *)&tcpstats_ary[cpu],
201: sizeof(struct tcp_stats))))
202: break;
203: }
204:
205: return (error);
206: }
207: SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLTYPE_OPAQUE|CTLFLAG_RW,
208: 0, 0, sysctl_tcpstats, "S,tcp_stats",
209: "TCP statistics (struct tcp_stats, netinet/tcp_stats.h)");
210: #else /* !SMP */
211: SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
212: &tcpstat , tcp_stats,
213: "TCP statistics (struct tcp_stats, netinet/tcp_stats.h)");
214: #endif
215:
/*
 * Target size of the TCP PCB hash tables.  Must be a power of two;
 * tcp_init() checks this and substitutes 512 when the check fails.
 *
 * The value may be supplied at build time (the definition below is only
 * used when TCBHASHSIZE is not already defined) and can be overridden
 * at boot by the kernel environment variable net.inet.tcp.tcbhashsize.
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE 512
#endif
225:
226: /*
227: * This is the actual shape of what we allocate using the zone
228: * allocator. Doing it this way allows us to protect both structures
229: * using the same generation count, and also eliminates the overhead
230: * of allocating tcpcbs separately. By hiding the structure here,
231: * we avoid changing most of the rest of the code (although it needs
232: * to be changed, eventually, for greater efficiency).
233: */
234: #define ALIGNMENT 32
235: #define ALIGNM1 (ALIGNMENT - 1)
236: struct inp_tp {
237: union {
238: struct inpcb inp;
239: char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
240: } inp_tp_u;
241: struct tcpcb tcb;
242: struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
243: struct callout inp_tp_delack;
244: };
245: #undef ALIGNMENT
246: #undef ALIGNM1
247:
248: /*
249: * Tcp initialization
250: */
251: void
252: tcp_init()
253: {
254: struct inpcbporthead *porthashbase;
255: u_long porthashmask;
256: struct inpcbhead *wildcardhashbase;
257: u_long wildcardhashmask;
258: struct vm_zone *ipi_zone;
259: int hashsize = TCBHASHSIZE;
260: int cpu;
261:
262: tcp_ccgen = 1;
263: tcp_cleartaocache();
264:
265: tcp_delacktime = TCPTV_DELACK;
266: tcp_keepinit = TCPTV_KEEP_INIT;
267: tcp_keepidle = TCPTV_KEEP_IDLE;
268: tcp_keepintvl = TCPTV_KEEPINTVL;
269: tcp_maxpersistidle = TCPTV_KEEP_IDLE;
270: tcp_msl = TCPTV_MSL;
271: tcp_rexmit_min = TCPTV_MIN;
272: tcp_rexmit_slop = TCPTV_CPU_VAR;
273:
274: TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
275: if (!powerof2(hashsize)) {
276: printf("WARNING: TCB hash size not a power of 2\n");
277: hashsize = 512; /* safe default */
278: }
279: tcp_tcbhashsize = hashsize;
280: porthashbase = hashinit(hashsize, M_PCB, &porthashmask);
281: wildcardhashbase = hashinit(hashsize, M_PCB, &wildcardhashmask);
282: ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets,
283: ZONE_INTERRUPT, 0);
284:
285: for (cpu = 0; cpu < ncpus2; cpu++) {
286: LIST_INIT(&tcbinfo[cpu].listhead);
287: tcbinfo[cpu].hashbase = hashinit(hashsize, M_PCB,
288: &tcbinfo[cpu].hashmask);
289: tcbinfo[cpu].porthashbase = porthashbase;
290: tcbinfo[cpu].porthashmask = porthashmask;
291: tcbinfo[cpu].wildcardhashbase = wildcardhashbase;
292: tcbinfo[cpu].wildcardhashmask = wildcardhashmask;
293: tcbinfo[cpu].ipi_zone = ipi_zone;
294: }
295:
296: tcp_reass_maxseg = nmbclusters / 16;
297: TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
298: &tcp_reass_maxseg);
299:
300: #ifdef INET6
301: #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
302: #else /* INET6 */
303: #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
304: #endif /* INET6 */
305: if (max_protohdr < TCP_MINPROTOHDR)
306: max_protohdr = TCP_MINPROTOHDR;
307: if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
308: panic("tcp_init");
309: #undef TCP_MINPROTOHDR
310:
311: /*
312: * Initialize TCP statistics.
313: *
314: * It is layed out as an array which is has one element for UP,
315: * and SMP_MAXCPU elements for SMP. This allows us to retain
316: * the access mechanism from userland for both UP and SMP.
317: */
318: #ifdef SMP
319: for (cpu = 0; cpu < ncpus; ++cpu) {
320: bzero(&tcpstats_ary[cpu], sizeof(struct tcp_stats));
321: }
322: #else
323: bzero(&tcpstat, sizeof(struct tcp_stats));
324: #endif
325:
326: syncache_init();
327: tcp_thread_init();
328: }
329:
330: /*
331: * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
332: * tcp_template used to store this data in mbufs, but we now recopy it out
333: * of the tcpcb each time to conserve mbufs.
334: */
335: void
336: tcp_fillheaders(tp, ip_ptr, tcp_ptr)
337: struct tcpcb *tp;
338: void *ip_ptr;
339: void *tcp_ptr;
340: {
341: struct inpcb *inp = tp->t_inpcb;
342: struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
343:
344: #ifdef INET6
345: if ((inp->inp_vflag & INP_IPV6) != 0) {
346: struct ip6_hdr *ip6;
347:
348: ip6 = (struct ip6_hdr *)ip_ptr;
349: ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
350: (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
351: ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
352: (IPV6_VERSION & IPV6_VERSION_MASK);
353: ip6->ip6_nxt = IPPROTO_TCP;
354: ip6->ip6_plen = sizeof(struct tcphdr);
355: ip6->ip6_src = inp->in6p_laddr;
356: ip6->ip6_dst = inp->in6p_faddr;
357: tcp_hdr->th_sum = 0;
358: } else
359: #endif
360: {
361: struct ip *ip = (struct ip *) ip_ptr;
362:
363: ip->ip_vhl = IP_VHL_BORING;
364: ip->ip_tos = 0;
365: ip->ip_len = 0;
366: ip->ip_id = 0;
367: ip->ip_off = 0;
368: ip->ip_ttl = 0;
369: ip->ip_sum = 0;
370: ip->ip_p = IPPROTO_TCP;
371: ip->ip_src = inp->inp_laddr;
372: ip->ip_dst = inp->inp_faddr;
373: tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
374: htons(sizeof(struct tcphdr) + IPPROTO_TCP));
375: }
376:
377: tcp_hdr->th_sport = inp->inp_lport;
378: tcp_hdr->th_dport = inp->inp_fport;
379: tcp_hdr->th_seq = 0;
380: tcp_hdr->th_ack = 0;
381: tcp_hdr->th_x2 = 0;
382: tcp_hdr->th_off = 5;
383: tcp_hdr->th_flags = 0;
384: tcp_hdr->th_win = 0;
385: tcp_hdr->th_urp = 0;
386: }
387:
388: /*
389: * Create template to be used to send tcp packets on a connection.
390: * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
391: * use for this function is in keepalives, which use tcp_respond.
392: */
393: struct tcptemp *
394: tcp_maketemplate(tp)
395: struct tcpcb *tp;
396: {
397: struct mbuf *m;
398: struct tcptemp *n;
399:
400: m = m_get(M_DONTWAIT, MT_HEADER);
401: if (m == NULL)
402: return (0);
403: m->m_len = sizeof(struct tcptemp);
404: n = mtod(m, struct tcptemp *);
405:
406: tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
407: return (n);
408: }
409:
/*
 * Send a single TCP segment described by the given IP header (ipgen)
 * and TCP header (th).
 *
 * If m == 0, a new header mbuf is allocated, the IP and TCP headers
 * are copied into it and the segment is sent directly to the addressed
 * host with flags forced to TH_ACK.  This is used to force keep alive
 * messages out using the TCP template for a connection.
 *
 * If m != 0, the segment is a reply to the TCP which originated the
 * segment in m: any mbufs chained after m are freed, m itself is
 * reused, and the source/destination addresses and ports are swapped
 * in place before the reply is sent with the caller's flags.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * tp may be NULL; in that case the advertised window is 0 and a
 * temporary, zeroed route is used and released after output.
 *
 * NOTE: If m != NULL, then ipgen and th must point to *inside* the mbuf.
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
	struct tcpcb *tp;
	void *ipgen;
	struct tcphdr *th;
	struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	int tlen;
	int win = 0;
	struct route *ro = 0;
	struct route sro;
	struct ip *ip;
	struct tcphdr *nth;
#ifdef INET6
	struct route_in6 *ro6 = 0;
	struct route_in6 sro6;
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;

	/* The IP version is taken from the header itself, not from tp. */
#ifdef INET6
	isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp) {
		/*
		 * Advertise the free receive buffer space, clamped to the
		 * largest window expressible with the current scale.  A
		 * RST leaves the window at 0.
		 */
		if (!(flags & TH_RST)) {
			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
		/* Use the route cached in the connection's pcb. */
#ifdef INET6
		if (isipv6)
			ro6 = &tp->t_inpcb->in6p_route;
		else
#endif /* INET6 */
		ro = &tp->t_inpcb->inp_route;
	} else {
		/* No connection: use a zeroed route on the stack. */
#ifdef INET6
		if (isipv6) {
			ro6 = &sro6;
			bzero(ro6, sizeof *ro6);
		} else
#endif /* INET6 */
		{
			ro = &sro;
			bzero(ro, sizeof *ro);
		}
	}
	if (m == 0) {
		/*
		 * No mbuf supplied: allocate one, leave room for the link
		 * header and copy both headers into it.  Nothing is sent
		 * if the allocation fails.
		 */
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
		tlen = 0;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			    sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
			ip = mtod(m, struct ip *);
			nth = (struct tcphdr *)(ip + 1);
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		/* The caller's flags are ignored on this path. */
		flags = TH_ACK;
	} else {
		/*
		 * Reuse the received mbuf: drop anything chained behind
		 * it and make its data start at the IP header.
		 */
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
		/* Turn the packet around: swap source and destination. */
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
			nth = (struct tcphdr *)(ip + 1);
		}
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
	/* Finish the IP header; tlen becomes the total packet length. */
#ifdef INET6
	if (isipv6) {
		ip6->ip6_flow = 0;
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
		    tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
	{
		tlen += sizeof (struct tcpiphdr);
		ip->ip_len = tlen;
		ip->ip_ttl = ip_defttl;
	}
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *) 0;
	/* Fill in the TCP header proper; no options are sent. */
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	/* The window is scaled only when a connection is known. */
	if (tp)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#ifdef INET6
	if (isipv6) {
		/* IPv6: the full checksum is computed here. */
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
		    sizeof(struct ip6_hdr),
		    tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
		    ro6 && ro6->ro_rt ?
		    ro6->ro_rt->rt_ifp :
		    NULL);
	} else
#endif /* INET6 */
	{
		/*
		 * IPv4: only the pseudo-header sum is stored; CSUM_TCP and
		 * csum_data record where the checksum field lives.
		 */
		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	}
#ifdef TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
	/*
	 * Hand the packet to the IP layer; the return value is ignored.
	 * A route looked up into the temporary on-stack structure is
	 * released afterwards.
	 */
#ifdef INET6
	if (isipv6) {
		(void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL,
		    tp ? tp->t_inpcb : NULL);
		if (ro6 == &sro6 && ro6->ro_rt) {
			RTFREE(ro6->ro_rt);
			ro6->ro_rt = NULL;
		}
	} else
#endif /* INET6 */
	{
		(void) ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL);
		if (ro == &sro && ro->ro_rt) {
			RTFREE(ro->ro_rt);
			ro->ro_rt = NULL;
		}
	}
}
595:
596: /*
597: * Create a new TCP control block, making an
598: * empty reassembly queue and hooking it to the argument
599: * protocol control block. The `inp' parameter must have
600: * come from the zone allocator set up in tcp_init().
601: */
602: struct tcpcb *
603: tcp_newtcpcb(inp)
604: struct inpcb *inp;
605: {
606: struct inp_tp *it;
607: struct tcpcb *tp;
608: #ifdef INET6
609: int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
610: #endif /* INET6 */
611:
612: it = (struct inp_tp *)inp;
613: tp = &it->tcb;
614: bzero((char *) tp, sizeof(struct tcpcb));
615: LIST_INIT(&tp->t_segq);
616: tp->t_maxseg = tp->t_maxopd =
617: #ifdef INET6
618: isipv6 ? tcp_v6mssdflt :
619: #endif /* INET6 */
620: tcp_mssdflt;
621:
622: /* Set up our timeouts. */
623: callout_init(tp->tt_rexmt = &it->inp_tp_rexmt);
624: callout_init(tp->tt_persist = &it->inp_tp_persist);
625: callout_init(tp->tt_keep = &it->inp_tp_keep);
626: callout_init(tp->tt_2msl = &it->inp_tp_2msl);
627: callout_init(tp->tt_delack = &it->inp_tp_delack);
628:
629: if (tcp_do_rfc1323)
630: tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
631: if (tcp_do_rfc1644)
632: tp->t_flags |= TF_REQ_CC;
633: tp->t_inpcb = inp; /* XXX */
634: /*
635: * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
636: * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
637: * reasonable initial retransmit time.
638: */
639: tp->t_srtt = TCPTV_SRTTBASE;
640: tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
641: tp->t_rttmin = tcp_rexmit_min;
642: tp->t_rxtcur = TCPTV_RTOBASE;
643: tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
644: tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
645: tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
646: tp->t_rcvtime = ticks;
647: tp->t_bw_rtttime = ticks;
648: /*
649: * IPv4 TTL initialization is necessary for an IPv6 socket as well,
650: * because the socket may be bound to an IPv6 wildcard address,
651: * which may match an IPv4-mapped IPv6 address.
652: */
653: inp->inp_ip_ttl = ip_defttl;
654: inp->inp_ppcb = (caddr_t)tp;
655: return (tp); /* XXX */
656: }
657:
658: /*
659: * Drop a TCP connection, reporting
660: * the specified error. If connection is synchronized,
661: * then send a RST to peer.
662: */
663: struct tcpcb *
664: tcp_drop(tp, errno)
665: struct tcpcb *tp;
666: int errno;
667: {
668: struct socket *so = tp->t_inpcb->inp_socket;
669:
670: if (TCPS_HAVERCVDSYN(tp->t_state)) {
671: tp->t_state = TCPS_CLOSED;
672: (void) tcp_output(tp);
673: tcpstat.tcps_drops++;
674: } else
675: tcpstat.tcps_conndrops++;
676: if (errno == ETIMEDOUT && tp->t_softerror)
677: errno = tp->t_softerror;
678: so->so_error = errno;
679: return (tcp_close(tp));
680: }
681:
/*
 * Close a TCP control block:
 *	stop all of the connection's timers
 *	save RTT, RTT variance and ssthresh in the route, when eligible
 *	discard all space held by the tcp (the reassembly queue)
 *	discard internet protocol block
 *	wake up any sleepers
 *
 * Always returns a null tcpcb pointer.
 */
struct tcpcb *
tcp_close(tp)
	struct tcpcb *tp;
{
	struct tseg_qent *q;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	struct rtentry *rt;
	int dosavessthresh;

	/*
	 * Make sure that all of our timers are stopped before we
	 * delete the PCB.
	 */
	callout_stop(tp->tt_rexmt);
	callout_stop(tp->tt_persist);
	callout_stop(tp->tt_keep);
	callout_stop(tp->tt_2msl);
	callout_stop(tp->tt_delack);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as the 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		u_long i = 0;
		/*
		 * Skip the update when there is no cached route or when
		 * the route key is the unspecified/any address.
		 */
#ifdef INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			if ((rt = inp->in6p_route.ro_rt) == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		}
		else
#endif /* INET6 */
		if ((rt = inp->inp_route.ro_rt) == NULL ||
		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
		    == INADDR_ANY)
			goto no_valid_rt;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			/* Convert srtt into the route metric's units. */
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			/* Same treatment for the RTT variance. */
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshold
		 * got below half the pipesize. I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace. In the absence of a reliable
		 * way to calculate the pipesize, it will have to do.
		 */
		i = tp->snd_ssthresh;
		/*
		 * "Bad news" test: ssthresh fell below half of the route's
		 * send pipe, or of the socket send buffer high-water mark
		 * when the route does not specify one.
		 */
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		    i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			/*
			 * Per-packet size is the segment size plus the
			 * IPv6+TCP or IPv4+TCP header size.
			 */
			i *= (u_long)(tp->t_maxseg +
#ifdef INET6
			    (isipv6 ? sizeof (struct ip6_hdr) +
			    sizeof (struct tcphdr) :
#endif
			    sizeof (struct tcpiphdr)
#ifdef INET6
			    )
#endif
			    );
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}
no_valid_rt:
	/* free the reassembly queue, if any */
	while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		FREE(q, M_TSEGQ);
		tcp_reass_qsize--;
	}
	/* Unhook the tcpcb from the pcb before the pcb is detached. */
	inp->inp_ppcb = NULL;
	soisdisconnected(so);
#ifdef INET6
	if (INP_CHECK_SOCKAF(so, AF_INET6))
		in6_pcbdetach(inp);
	else
#endif /* INET6 */
	in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	return ((struct tcpcb *)0);
}
833:
834: static __inline void
835: tcp_drain_oncpu(struct inpcbhead *head)
836: {
837: struct inpcb *inpb;
838: struct tcpcb *tcpb;
839: struct tseg_qent *te;
840:
841: LIST_FOREACH(inpb, head, inp_list) {
842: if ((tcpb = intotcpcb(inpb))) {
843: while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) {
844: LIST_REMOVE(te, tqe_q);
845: m_freem(te->tqe_m);
846: FREE(te, M_TSEGQ);
847: tcp_reass_qsize--;
848: }
849: }
850: }
851: }
852:
#ifdef SMP
/*
 * Message asking another cpu to drain the reassembly queues of the pcb
 * list in nm_head.
 * NOTE(review): the handler below casts a struct netmsg pointer to this
 * type, so the leading members presumably have to mirror struct netmsg
 * -- confirm against its definition.
 */
struct netmsg_tcp_drain {
	struct lwkt_msg		nm_lmsg;
	netisr_fn_t		nm_handler;
	struct inpcbhead	*nm_head;
};

/*
 * Handler for a netmsg_tcp_drain message: drain the requested pcb list
 * and reply to the message with status 0.
 */
static void
tcp_drain_handler(struct netmsg *msg0)
{
	struct netmsg_tcp_drain *drainmsg;

	drainmsg = (struct netmsg_tcp_drain *)msg0;
	tcp_drain_oncpu(drainmsg->nm_head);

	lwkt_replymsg(&msg0->nm_lmsg, 0);
}
#endif
869:
870: void
871: tcp_drain()
872: {
873: #ifdef SMP
874: int cpu;
875: #endif
876:
877: if (!do_tcpdrain)
878: return;
879:
880: /*
881: * Walk the tcpbs, if existing, and flush the reassembly queue,
882: * if there is one...
883: * XXX: The "Net/3" implementation doesn't imply that the TCP
884: * reassembly queue should be flushed, but in a situation
885: * where we're really low on mbufs, this is potentially
886: * usefull.
887: */
888: #ifdef SMP
889: for (cpu = 0; cpu < ncpus2; cpu++) {
890: struct netmsg_tcp_drain *msg;
891:
892: if (cpu == mycpu->gd_cpuid) {
893: tcp_drain_oncpu(&tcbinfo[cpu].listhead);
894: } else {
895: msg = malloc(sizeof(struct netmsg_tcp_drain),
896: M_LWKTMSG, M_NOWAIT);
897: if (!msg)
898: continue;
899: lwkt_initmsg_rp(&msg->nm_lmsg, &netisr_afree_rport,
900: CMD_NETMSG_ONCPU);
901: msg->nm_handler = tcp_drain_handler;
902: msg->nm_head = &tcbinfo[cpu].listhead;
903: lwkt_sendmsg(tcp_cport(cpu), &msg->nm_lmsg);
904: }
905: }
906: #else
907: tcp_drain_oncpu(&tcbinfo[0].listhead);
908: #endif
909: }
910:
911: /*
912: * Notify a tcp user of an asynchronous error;
913: * store error as soft error, but wake up user
914: * (for now, won't do anything until can select for soft error).
915: *
916: * Do not wake up user since there currently is no mechanism for
917: * reporting soft errors (yet - a kqueue filter may be added).
918: */
919: static void
920: tcp_notify(inp, error)
921: struct inpcb *inp;
922: int error;
923: {
924: struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
925:
926: /*
927: * Ignore some errors if we are hooked up.
928: * If connection hasn't completed, has retransmitted several times,
929: * and receives a second error, give up now. This is better
930: * than waiting a long time to establish a connection that
931: * can never complete.
932: */
933: if (tp->t_state == TCPS_ESTABLISHED &&
934: (error == EHOSTUNREACH || error == ENETUNREACH ||
935: error == EHOSTDOWN)) {
936: return;
937: } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
938: tp->t_softerror)
939: tcp_drop(tp, error);
940: else
941: tp->t_softerror = error;
942: #if 0
943: wakeup((caddr_t) &so->so_timeo);
944: sorwakeup(so);
945: sowwakeup(so);
946: #endif
947: }
948:
949: static int
950: tcp_pcblist(SYSCTL_HANDLER_ARGS)
951: {
952: int error, i, n, s;
953: struct inpcb *inp, **inp_list;
954: inp_gen_t gencnt;
955: struct xinpgen xig;
956:
957: /*
958: * The process of preparing the TCB list is too time-consuming and
959: * resource-intensive to repeat twice on every request.
960: */
961: if (req->oldptr == 0) {
962: n = tcbinfo[mycpu->gd_cpuid].ipi_count;
963: req->oldidx = 2 * (sizeof xig)
964: + (n + n/8) * sizeof(struct xtcpcb);
965: return 0;
966: }
967:
968: if (req->newptr != 0)
969: return EPERM;
970:
971: /*
972: * OK, now we're committed to doing something.
973: */
974: s = splnet();
975: gencnt = tcbinfo[mycpu->gd_cpuid].ipi_gencnt;
976: n = tcbinfo[mycpu->gd_cpuid].ipi_count;
977: splx(s);
978:
979: xig.xig_len = sizeof xig;
980: xig.xig_count = n;
981: xig.xig_gen = gencnt;
982: xig.xig_sogen = so_gencnt;
983: error = SYSCTL_OUT(req, &xig, sizeof xig);
984: if (error)
985: return error;
986:
987: inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
988: if (inp_list == 0)
989: return ENOMEM;
990:
991: s = splnet();
992: for (inp = LIST_FIRST(&tcbinfo[mycpu->gd_cpuid].listhead), i = 0;
993: inp && i < n; inp = LIST_NEXT(inp, inp_list)) {
994: if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->td, inp))
995: inp_list[i++] = inp;
996: }
997: splx(s);
998: n = i;
999:
1000: error = 0;
1001: for (i = 0; i < n; i++) {
1002: inp = inp_list[i];
1003: if (inp->inp_gencnt <= gencnt) {
1004: struct xtcpcb xt;
1005: caddr_t inp_ppcb;
1006: xt.xt_len = sizeof xt;
1007: /* XXX should avoid extra copy */
1008: bcopy(inp, &xt.xt_inp, sizeof *inp);
1009: inp_ppcb = inp->inp_ppcb;
1010: if (inp_ppcb != NULL)
1011: bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
1012: else
1013: bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
1014: if (inp->inp_socket)
1015: sotoxsocket(inp->inp_socket, &xt.xt_socket);
1016: error = SYSCTL_OUT(req, &xt, sizeof xt);
1017: }
1018: }
1019: if (!error) {
1020: /*
1021: * Give the user an updated idea of our state.
1022: * If the generation differs from what we told
1023: * her before, she knows that something happened
1024: * while we were processing this request, and it
1025: * might be necessary to retry.
1026: */
1027: s = splnet();
1028: xig.xig_gen = tcbinfo[mycpu->gd_cpuid].ipi_gencnt;
1029: xig.xig_sogen = so_gencnt;
1030: xig.xig_count = tcbinfo[mycpu->gd_cpuid].ipi_count;
1031: splx(s);
1032: error = SYSCTL_OUT(req, &xig, sizeof xig);
1033: }
1034: free(inp_list, M_TEMP);
1035: return error;
1036: }
1037:
1038: SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
1039: tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
1040:
1041: static int
1042: tcp_getcred(SYSCTL_HANDLER_ARGS)
1043: {
1044: struct sockaddr_in addrs[2];
1045: struct inpcb *inp;
1046: int cpu;
1047: int error, s;
1048:
1049: error = suser(req->td);
1050: if (error)
1051: return (error);
1052: error = SYSCTL_IN(req, addrs, sizeof(addrs));
1053: if (error)
1054: return (error);
1055: s = splnet();
1056:
1057: #ifdef TCP_DISTRIBUTED_TCBINFO
1058: cpu = tcp_addrcpu(addrs[1].sin_addr.s_addr, addrs[1].sin_port,
1059: addrs[0].sin_addr.s_addr, addrs[0].sin_port);
1060: #else
1061: cpu = 0;
1062: #endif
1063: inp = in_pcblookup_hash(&tcbinfo[cpu], addrs[1].sin_addr,
1064: addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
1065: if (inp == NULL || inp->inp_socket == NULL) {
1066: error = ENOENT;
1067: goto out;
1068: }
1069: error = SYSCTL_OUT(req, inp->inp_socket->so_cred, sizeof(struct ucred));
1070: out:
1071: splx(s);
1072: return (error);
1073: }
1074:
1075: SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
1076: 0, 0, tcp_getcred, "S,ucred", "Get the ucred of a TCP connection");
1077:
#ifdef INET6
/*
 * Sysctl handler: copy out the credentials of the socket that owns a
 * given TCP connection described by two sockaddr_in6 structures.
 *
 * If addrs[0] is an IPv4-mapped address then addrs[1] must be one as
 * well (EINVAL otherwise); in that case the embedded IPv4 addresses
 * (bytes 12..15) are used for an IPv4 hash lookup.  All other requests
 * go through the IPv6 hash lookup.  Both lookups search tcbinfo[0].
 *
 * Returns the error from suser(), SYSCTL_IN() or SYSCTL_OUT(), EINVAL
 * as described above, or ENOENT when no PCB matches or the matching PCB
 * has no socket attached.
 */
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct sockaddr_in6 addrs[2];
	struct inpcb *inp;
	int error, s, mapped = 0;

	if ((error = suser(req->td)) != 0)
		return (error);
	if ((error = SYSCTL_IN(req, addrs, sizeof(addrs))) != 0)
		return (error);

	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
		if (!IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
			return (EINVAL);
		mapped = 1;
	}

	s = splnet();
	if (mapped) {
		inp = in_pcblookup_hash(&tcbinfo[0],
		    *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
		    addrs[1].sin6_port,
		    *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
		    addrs[0].sin6_port,
		    0, NULL);
	} else {
		inp = in6_pcblookup_hash(&tcbinfo[0],
		    &addrs[1].sin6_addr, addrs[1].sin6_port,
		    &addrs[0].sin6_addr, addrs[0].sin6_port,
		    0, NULL);
	}

	if (inp != NULL && inp->inp_socket != NULL) {
		error = SYSCTL_OUT(req, inp->inp_socket->so_cred,
		    sizeof(struct ucred));
	} else {
		error = ENOENT;
	}

	splx(s);
	return (error);
}

SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, tcp6_getcred,
    "S,ucred", "Get the ucred of a TCP6 connection");
#endif
1127:
1128:
1129: void
1130: tcp_ctlinput(cmd, sa, vip)
1131: int cmd;
1132: struct sockaddr *sa;
1133: void *vip;
1134: {
1135: struct ip *ip = vip;
1136: struct tcphdr *th;
1137: struct in_addr faddr;
1138: struct inpcb *inp;
1139: struct tcpcb *tp;
1140: void (*notify) (struct inpcb *, int) = tcp_notify;
1141: tcp_seq icmp_seq;
1142: int cpu;
1143: int s;
1144:
1145: faddr = ((struct sockaddr_in *)sa)->sin_addr;
1146: if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1147: return;
1148:
1149: if (cmd == PRC_QUENCH)
1150: notify = tcp_quench;
1151: else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
1152: cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
1153: notify = tcp_drop_syn_sent;
1154: else if (cmd == PRC_MSGSIZE)
1155: notify = tcp_mtudisc;
1156: else if (PRC_IS_REDIRECT(cmd)) {
1157: ip = 0;
1158: notify = in_rtchange;
1159: } else if (cmd == PRC_HOSTDEAD)
1160: ip = 0;
1161: else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
1162: return;
1163: if (ip) {
1164: s = splnet();
1165: th = (struct tcphdr *)((caddr_t)ip
1166: + (IP_VHL_HL(ip->ip_vhl) << 2));
1167: #ifdef TCP_DISTRIBUTED_TCBINFO
1168: cpu = tcp_addrcpu(faddr.s_addr, th->th_dport,
1169: ip->ip_src.s_addr, th->th_sport);
1170: #else
1171: cpu = 0;
1172: #endif
1173: inp = in_pcblookup_hash(&tcbinfo[cpu], faddr, th->th_dport,
1174: ip->ip_src, th->th_sport, 0, NULL);
1175: if (inp != NULL && inp->inp_socket != NULL) {
1176: icmp_seq = htonl(th->th_seq);
1177: tp = intotcpcb(inp);
1178: if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
1179: SEQ_LT(icmp_seq, tp->snd_max))
1180: (*notify)(inp, inetctlerrmap[cmd]);
1181: } else {
1182: struct in_conninfo inc;
1183:
1184: inc.inc_fport = th->th_dport;
1185: inc.inc_lport = th->th_sport;
1186: inc.inc_faddr = faddr;
1187: inc.inc_laddr = ip->ip_src;
1188: #ifdef INET6
1189: inc.inc_isipv6 = 0;
1190: #endif
1191: syncache_unreach(&inc, th);
1192: }
1193: splx(s);
1194: } else {
1195: for (cpu = 0; cpu < ncpus2; cpu++) {
1196: in_pcbnotifyall(&tcbinfo[cpu].listhead, faddr,
1197: inetctlerrmap[cmd], notify);
1198: }
1199: }
1200: }
1201:
#ifdef INET6
/*
 * Control-input entry point for TCP over IPv6.
 *
 * cmd - PRC_* request code.
 * sa  - address the request refers to; the request is ignored unless
 *       this is an AF_INET6 sockaddr of exactly sockaddr_in6 size.
 * d   - NULL, or a struct ip6ctlparam carrying the mbuf, IPv6 header,
 *       offset and source address the request refers to.
 *
 * When an IPv6 header is supplied, the TCP port pair is copied out of
 * the mbuf at the given offset, matching PCBs are notified and the
 * syncache is informed.  Otherwise PCBs are notified with both ports
 * given as 0.
 */
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	struct tcphdr th;
	void (*notify) (struct inpcb *, int) = tcp_notify;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	struct ip6ctlparam *ip6cp = NULL;
	const struct sockaddr_in6 *sa6_src = NULL;
	int off;
	/* Just the leading port fields of a TCP header. */
	struct tcp_portonly {
		u_int16_t th_sport;
		u_int16_t th_dport;
	} *thp;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	/*
	 * Valid indices into inet6ctlerrmap[] are 0 .. PRC_NCMDS - 1, so
	 * cmd == PRC_NCMDS must be rejected as well (">=", not ">").
	 */
	else if (!PRC_IS_REDIRECT(cmd) &&
	    ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
	} else {
		m = NULL;
		ip6 = NULL;
		off = 0;	/* fool gcc */
		sa6_src = &sa6_any;
	}

	if (ip6) {
		struct in_conninfo inc;
		/*
		 * XXX: We assume that when IPV6 is non NULL,
		 * M and OFF are valid.
		 */

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(*thp))
			return;

		/* Only the port pair is copied; the rest of th stays zero. */
		bzero(&th, sizeof(th));
		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

		in6_pcbnotify(&tcbinfo[0].listhead, sa, th.th_dport,
		    (struct sockaddr *)ip6cp->ip6c_src,
		    th.th_sport, cmd, notify);

		inc.inc_fport = th.th_dport;
		inc.inc_lport = th.th_sport;
		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
		inc.inc_isipv6 = 1;
		syncache_unreach(&inc, &th);
	} else
		in6_pcbnotify(&tcbinfo[0].listhead, sa, 0,
		    (const struct sockaddr *)sa6_src, 0, cmd, notify);
}
#endif /* INET6 */
1276:
1277:
1278: /*
1279: * Following is where TCP initial sequence number generation occurs.
1280: *
1281: * There are two places where we must use initial sequence numbers:
1282: * 1. In SYN-ACK packets.
1283: * 2. In SYN packets.
1284: *
1285: * All ISNs for SYN-ACK packets are generated by the syncache. See
1286: * tcp_syncache.c for details.
1287: *
1288: * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
1289: * depends on this property. In addition, these ISNs should be
1290: * unguessable so as to prevent connection hijacking. To satisfy
1291: * the requirements of this situation, the algorithm outlined in
1292: * RFC 1948 is used to generate sequence numbers.
1293: *
1294: * Implementation details:
1295: *
1296: * Time is based off the system timer, and is corrected so that it
1297: * increases by one megabyte per second. This allows for proper
1298: * recycling on high speed LANs while still leaving over an hour
1299: * before rollover.
1300: *
1301: * net.inet.tcp.isn_reseed_interval controls the number of seconds
1302: * between seeding of isn_secret. This is normally set to zero,
1303: * as reseeding should not be necessary.
1304: *
1305: */
1306:
1307: #define ISN_BYTES_PER_SECOND 1048576
1308:
1309: u_char isn_secret[32];
1310: int isn_last_reseed;
1311: MD5_CTX isn_ctx;
1312:
1313: tcp_seq
1314: tcp_new_isn(tp)
1315: struct tcpcb *tp;
1316: {
1317: u_int32_t md5_buffer[4];
1318: tcp_seq new_isn;
1319:
1320: /* Seed if this is the first use, reseed if requested. */
1321: if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
1322: (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
1323: < (u_int)ticks))) {
1324: read_random_unlimited(&isn_secret, sizeof(isn_secret));
1325: isn_last_reseed = ticks;
1326: }
1327:
1328: /* Compute the md5 hash and return the ISN. */
1329: MD5Init(&isn_ctx);
1330: MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
1331: MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
1332: #ifdef INET6
1333: if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
1334: MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
1335: sizeof(struct in6_addr));
1336: MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
1337: sizeof(struct in6_addr));
1338: } else
1339: #endif
1340: {
1341: MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
1342: sizeof(struct in_addr));
1343: MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
1344: sizeof(struct in_addr));
1345: }
1346: MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
1347: MD5Final((u_char *) &md5_buffer, &isn_ctx);
1348: new_isn = (tcp_seq) md5_buffer[0];
1349: new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
1350: return new_isn;
1351: }
1352:
1353: /*
1354: * When a source quench is received, close congestion window
1355: * to one segment. We will gradually open it again as we proceed.
1356: */
1357: void
1358: tcp_quench(inp, errno)
1359: struct inpcb *inp;
1360: int errno;
1361: {
1362: struct tcpcb *tp = intotcpcb(inp);
1363:
1364: if (tp)
1365: tp->snd_cwnd = tp->t_maxseg;
1366: }
1367:
1368: /*
1369: * When a specific ICMP unreachable message is received and the
1370: * connection state is SYN-SENT, drop the connection. This behavior
1371: * is controlled by the icmp_may_rst sysctl.
1372: */
1373: void
1374: tcp_drop_syn_sent(inp, errno)
1375: struct inpcb *inp;
1376: int errno;
1377: {
1378: struct tcpcb *tp = intotcpcb(inp);
1379:
1380: if (tp && tp->t_state == TCPS_SYN_SENT)
1381: tcp_drop(tp, errno);
1382: }
1383:
1384: /*
1385: * When `need fragmentation' ICMP is received, update our idea of the MSS
1386: * based on the new value in the route. Also nudge TCP to send something,
1387: * since we know the packet we just sent was dropped.
1388: * This duplicates some code in the tcp_mss() function in tcp_input.c.
1389: */
1390: void
1391: tcp_mtudisc(inp, errno)
1392: struct inpcb *inp;
1393: int errno;
1394: {
1395: struct tcpcb *tp = intotcpcb(inp);
1396: struct rtentry *rt;
1397: struct rmxp_tao *taop;
1398: struct socket *so = inp->inp_socket;
1399: int offered;
1400: int mss;
1401: #ifdef INET6
1402: int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
1403: #endif /* INET6 */
1404:
1405: if (tp) {
1406: #ifdef INET6
1407: if (isipv6)
1408: rt = tcp_rtlookup6(&inp->inp_inc);
1409: else
1410: #endif /* INET6 */
1411: rt = tcp_rtlookup(&inp->inp_inc);
1412: if (!rt || !rt->rt_rmx.rmx_mtu) {
1413: tp->t_maxopd = tp->t_maxseg =
1414: #ifdef INET6
1415: isipv6 ? tcp_v6mssdflt :
1416: #endif /* INET6 */
1417: tcp_mssdflt;
1418: return;
1419: }
1420: taop = rmx_taop(rt->rt_rmx);
1421: offered = taop->tao_mssopt;
1422: mss = rt->rt_rmx.rmx_mtu -
1423: #ifdef INET6
1424: (isipv6 ?
1425: sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
1426: #endif /* INET6 */
1427: sizeof(struct tcpiphdr)
1428: #ifdef INET6
1429: )
1430: #endif /* INET6 */
1431: ;
1432:
1433: if (offered)
1434: mss = min(mss, offered);
1435: /*
1436: * XXX - The above conditional probably violates the TCP
1437: * spec. The problem is that, since we don't know the
1438: * other end's MSS, we are supposed to use a conservative
1439: * default. But, if we do that, then MTU discovery will
1440: * never actually take place, because the conservative
1441: * default is much less than the MTUs typically seen
1442: * on the Internet today. For the moment, we'll sweep
1443: * this under the carpet.
1444: *
1445: * The conservative default might not actually be a problem
1446: * if the only case this occurs is when sending an initial
1447: * SYN with options and data to a host we've never talked
1448: * to before. Then, they will reply with an MSS value which
1449: * will get recorded and the new parameters should get
1450: * recomputed. For Further Study.
1451: */
1452: if (tp->t_maxopd <= mss)
1453: return;
1454: tp->t_maxopd = mss;
1455:
1456: if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1457: (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
1458: mss -= TCPOLEN_TSTAMP_APPA;
1459: if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
1460: (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
1461: mss -= TCPOLEN_CC_APPA;
1462: #if (MCLBYTES & (MCLBYTES - 1)) == 0
1463: if (mss > MCLBYTES)
1464: mss &= ~(MCLBYTES-1);
1465: #else
1466: if (mss > MCLBYTES)
1467: mss = mss / MCLBYTES * MCLBYTES;
1468: #endif
1469: if (so->so_snd.sb_hiwat < mss)
1470: mss = so->so_snd.sb_hiwat;
1471:
1472: tp->t_maxseg = mss;
1473:
1474: tcpstat.tcps_mturesent++;
1475: tp->t_rtttime = 0;
1476: tp->snd_nxt = tp->snd_una;
1477: tcp_output(tp);
1478: }
1479: }
1480:
1481: /*
1482: * Look-up the routing entry to the peer of this inpcb. If no route
1483: * is found and it cannot be allocated the return NULL. This routine
1484: * is called by TCP routines that access the rmx structure and by tcp_mss
1485: * to get the interface MTU.
1486: */
1487: struct rtentry *
1488: tcp_rtlookup(inc)
1489: struct in_conninfo *inc;
1490: {
1491: struct route *ro;
1492: struct rtentry *rt;
1493:
1494: ro = &inc->inc_route;
1495: rt = ro->ro_rt;
1496: if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1497: /* No route yet, so try to acquire one */
1498: if (inc->inc_faddr.s_addr != INADDR_ANY) {
1499: ro->ro_dst.sa_family = AF_INET;
1500: ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
1501: ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
1502: inc->inc_faddr;
1503: rtalloc(ro);
1504: rt = ro->ro_rt;
1505: }
1506: }
1507: return rt;
1508: }
1509:
#ifdef INET6
/*
 * IPv6 counterpart of tcp_rtlookup(): return the route cached in inc,
 * first trying to acquire one through rtalloc() when the cache is empty
 * or not RTF_UP and the foreign IPv6 address is specified.
 */
struct rtentry *
tcp_rtlookup6(inc)
	struct in_conninfo *inc;
{
	struct route_in6 *ro6 = &inc->inc6_route;
	struct rtentry *rt = ro6->ro_rt;

	/* A cached route that is up can be used as is. */
	if (rt != NULL && (rt->rt_flags & RTF_UP))
		return rt;

	/* Without a foreign address there is nothing to look up. */
	if (IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr))
		return rt;

	ro6->ro_dst.sin6_family = AF_INET6;
	ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
	ro6->ro_dst.sin6_addr = inc->inc6_faddr;
	rtalloc((struct route *)ro6);
	return ro6->ro_rt;
}
#endif /* INET6 */
1533:
#ifdef IPSEC
/*
 * Compute the ESP/AH header size for TCP, including the outer IP header.
 *
 * A scratch mbuf is filled with this connection's IP(v6) + TCP header
 * template and handed to ipsec4_hdrsiz()/ipsec6_hdrsiz() for the
 * outbound direction; the mbuf is released before returning.
 * Returns 0 when tp or its inpcb is NULL, or when no mbuf is available.
 */
size_t
ipsec_hdrsiz_tcp(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp;
	struct mbuf *m;
	size_t hdrsiz;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif /* INET6 */
	struct tcphdr *th;

	if (tp == NULL)
		return 0;
	inp = tp->t_inpcb;
	if (inp == NULL)
		return 0;

	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return 0;

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		m->m_pkthdr.len = m->m_len;
		tcp_fillheaders(tp, ip6, th);
		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	} else
#endif /* INET6 */
	{
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		m->m_len = sizeof(struct tcpiphdr);
		m->m_pkthdr.len = m->m_len;
		tcp_fillheaders(tp, ip, th);
		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	}

	m_free(m);
	return hdrsiz;
}
#endif /*IPSEC*/
1577:
1578: /*
1579: * Return a pointer to the cached information about the remote host.
1580: * The cached information is stored in the protocol specific part of
1581: * the route metrics.
1582: */
1583: struct rmxp_tao *
1584: tcp_gettaocache(inc)
1585: struct in_conninfo *inc;
1586: {
1587: struct rtentry *rt;
1588:
1589: #ifdef INET6
1590: if (inc->inc_isipv6)
1591: rt = tcp_rtlookup6(inc);
1592: else
1593: #endif /* INET6 */
1594: rt = tcp_rtlookup(inc);
1595:
1596: /* Make sure this is a host route and is up. */
1597: if (rt == NULL ||
1598: (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
1599: return NULL;
1600:
1601: return rmx_taop(rt->rt_rmx);
1602: }
1603:
/*
 * Clear all the TAO cache entries; called from tcp_init.
 *
 * XXX
 * Intentionally empty: the routing tables are assumed to be initialized
 * at the same time as TCP, so there is nothing left over in the cache.
 */
static void
tcp_cleartaocache(void)
{
	/* nothing to do */
}
1616:
1617: /*
1618: * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
1619: *
1620: * This code attempts to calculate the bandwidth-delay product as a
1621: * means of determining the optimal window size to maximize bandwidth,
1622: * minimize RTT, and avoid the over-allocation of buffers on interfaces and
1623: * routers. This code also does a fairly good job keeping RTTs in check
1624: * across slow links like modems. We implement an algorithm which is very
1625: * similar to (but not meant to be) TCP/Vegas. The code operates on the
1626: * transmitter side of a TCP connection and so only affects the transmit
1627: * side of the connection.
1628: *
1629: * BACKGROUND: TCP makes no provision for the management of buffer space
1630: * at the end points or at the intermediate routers and switches. A TCP
1631: * stream, whether using NewReno or not, will eventually buffer as
1632: * many packets as it is able and the only reason this typically works is
1633: * due to the fairly small default buffers made available for a connection
1634: * (typically 16K or 32K). As machines use larger windows and/or window
1635: * scaling it is now fairly easy for even a single TCP connection to blow-out
1636: * all available buffer space not only on the local interface, but on
1637: * intermediate routers and switches as well. NewReno makes a misguided
1638: * attempt to 'solve' this problem by waiting for an actual failure to occur,
1639: * then backing off, then steadily increasing the window again until another
1640: * failure occurs, ad-infinitum. This results in terrible oscillation that
1641: * is only made worse as network loads increase and the idea of intentionally
1642: * blowing out network buffers is, frankly, a terrible way to manage network
1643: * resources.
1644: *
1645: * It is far better to limit the transmit window prior to the failure
1646: * condition being achieved. There are two general ways to do this: First
1647: * you can 'scan' through different transmit window sizes and locate the
1648: * point where the RTT stops increasing, indicating that you have filled the
1649: * pipe, then scan backwards until you note that RTT stops decreasing, then
1650: * repeat ad-infinitum. This method works in principle but has severe
1651: * implementation issues due to RTT variances, timer granularity, and
1652: * instability in the algorithm which can lead to many false positives and
1653: * create oscillations as well as interact badly with other TCP streams
1654: * implementing the same algorithm.
1655: *
1656: * The second method is to limit the window to the bandwidth delay product
1657: * of the link. This is the method we implement. RTT variances and our
1658: * own manipulation of the congestion window, bwnd, can potentially
1659: * destabilize the algorithm. For this reason we have to stabilize the
1660: * elements used to calculate the window. We do this by using the minimum
1661: * observed RTT, the long term average of the observed bandwidth, and
1662: * by adding two segments worth of slop. It isn't perfect but it is able
1663: * to react to changing conditions and gives us a very stable basis on
1664: * which to extend the algorithm.
1665: */
1666: void
1667: tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
1668: {
1669: u_long bw;
1670: u_long bwnd;
1671: int save_ticks;
1672:
1673: /*
1674: * If inflight_enable is disabled in the middle of a tcp connection,
1675: * make sure snd_bwnd is effectively disabled.
1676: */
1677: if (tcp_inflight_enable == 0) {
1678: tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1679: tp->snd_bandwidth = 0;
1680: return;
1681: }
1682:
1683: /*
1684: * Figure out the bandwidth. Due to the tick granularity this
1685: * is a very rough number and it MUST be averaged over a fairly
1686: * long period of time. XXX we need to take into account a link
1687: * that is not using all available bandwidth, but for now our
1688: * slop will ramp us up if this case occurs and the bandwidth later
1689: * increases.
1690: *
1691: * Note: if ticks rollover 'bw' may wind up negative. We must
1692: * effectively reset t_bw_rtttime for this case.
1693: */
1694: save_ticks = ticks;
1695: if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
1696: return;
1697:
1698: bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
1699: (save_ticks - tp->t_bw_rtttime);
1700: tp->t_bw_rtttime = save_ticks;
1701: tp->t_bw_rtseq = ack_seq;
1702: if (tp->t_bw_rtttime == 0 || (int)bw < 0)
1703: return;
1704: bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
1705:
1706: tp->snd_bandwidth = bw;
1707:
1708: /*
1709: * Calculate the semi-static bandwidth delay product, plus two maximal
1710: * segments. The additional slop puts us squarely in the sweet
1711: * spot and also handles the bandwidth run-up case. Without the
1712: * slop we could be locking ourselves into a lower bandwidth.
1713: *
1714: * Situations Handled:
1715: * (1) Prevents over-queueing of packets on LANs, especially on
1716: * high speed LANs, allowing larger TCP buffers to be
1717: * specified, and also does a good job preventing
1718: * over-queueing of packets over choke points like modems
1719: * (at least for the transmit side).
1720: *
1721: * (2) Is able to handle changing network loads (bandwidth
1722: * drops so bwnd drops, bandwidth increases so bwnd
1723: * increases).
1724: *
1725: * (3) Theoretically should stabilize in the face of multiple
1726: * connections implementing the same algorithm (this may need
1727: * a little work).
1728: *
1729: * (4) Stability value (defaults to 20 = 2 maximal packets) can
1730: * be adjusted with a sysctl but typically only needs to be on
1731: * very slow connections. A value no smaller then 5 should
1732: * be used, but only reduce this default if you have no other
1733: * choice.
1734: */
1735: #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
1736: bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * (int)tp->t_maxseg / 10;
1737: #undef USERTT
1738:
1739: if (tcp_inflight_debug > 0) {
1740: static int ltime;
1741: if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
1742: ltime = ticks;
1743: printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
1744: tp,
1745: bw,
1746: tp->t_rttbest,
1747: tp->t_srtt,
1748: bwnd
1749: );
1750: }
1751: }
1752: if ((long)bwnd < tcp_inflight_min)
1753: bwnd = tcp_inflight_min;
1754: if (bwnd > tcp_inflight_max)
1755: bwnd = tcp_inflight_max;
1756: if ((long)bwnd < tp->t_maxseg * 2)
1757: bwnd = tp->t_maxseg * 2;
1758: tp->snd_bwnd = bwnd;
1759: }