TCP max seg size option handling badly broken

cak at Purdue.ARPA cak at Purdue.ARPA
Thu Mar 22 06:13:00 AEST 1984


From:  Christopher A Kent <cak at Purdue.ARPA>

Description:
	Handling of the TCP maximum segment size option is broken in 
	many respects. Since they are all related, this is submitted 
	as just one fix.

	On output, the max seg size is always offered as 1024. This
	causes IP fragmentation overhead for networks that do not
	support this large a packet; it is a particularly bad 
	value for the Arpanet.
	
	On input, if a connection is in the LISTEN state (e.g., a
	server), an incoming maximum segment size option is ignored. In
	other states, the maximum segment size option is accepted on all
	packets, contrary to the spec, which says it is only acceptable
	on packets with SYN.

	Both of these values should be tuned to the mtu of the interface
	being used for the connection; if the mtu is larger, use
	the offered value; otherwise set it to the mtu minus headers.

Repeat-By:
	Connect to various sites with different mtu or max seg options,
	and look at the tcpcb's with adb. Of particular interest are
	sites which have very small mtu networks attached.
	
Fix:
	The fix consists of several pieces. A new routine, tcp_getif(),
	is introduced, which returns a pointer to the interface being
	used for this connection. When initiating an outgoing TCP
	connection, tcp_getif() is called, and the proffered max
	seg size is tuned to the mtu of the device.

	When a LISTENing connection is probed by a SYN packet, we can't
	process the incoming options until a tcpcb has been allocated,
	or the process of allocation will overwrite the processed
	options with the default values.

	Whenever a connection finally enters the ESTABLISHED state, 
	compare the current value of t_maxseg to the mtu of the 
	interface being used and adjust it if necessary.

	(This set of fixes draws inspiration from an earlier fix
	from gilligan at sri-spam; unfortunately his called tcp_getif()
	at the time socket() was called, before bind(), so there
	was never an interface/route registered, and default values
	were always used; he also missed several of the above cases.)

	The following context diffs (from the original files) show the 
	necessary changes (don't trust the line numbers):

*** tcp_input.c.v0
--- tcp_input.c	
***************
*** 49,54
  	short ostate;
  	struct in_addr laddr;
  	int dropsocket = 0;
  
  	/*
  	 * Get IP and TCP header together in first mbuf.

--- 50,58 -----
  	short ostate;
  	struct in_addr laddr;
  	int dropsocket = 0;
+ 	struct ifnet *ifp;
+ 	struct ifnet *tcp_getif();
+ 	u_short ifmss;
  
  	/*
  	 * Get IP and TCP header together in first mbuf.
***************
*** 185,194
  	/*
  	 * Process options.
  	 */
! 	if (om) {
! 		tcp_dooptions(tp, om);
! 		om = 0;
! 	}
  
  	/*
  	 * Calculate amount of space in receive window,

--- 189,200 -----
  	/*
  	 * Process options.
  	 */
! 	if (om) 
! 		/* if in LISTEN, template hasn't been filled in yet */
! 		if (tp->t_state != TCPS_LISTEN){
! 			tcp_dooptions(tp, ti, om);
! 			om = 0;
! 		}
  
  	/*
  	 * Calculate amount of space in receive window,
***************
*** 246,251
  			tp = 0;
  			goto drop;
  		}
  		tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
  		tp->irs = ti->ti_seq;
  		tcp_sendseqinit(tp);

--- 252,262 -----
  			tp = 0;
  			goto drop;
  		}
+ 		/* finally, we can do the options */
+ 		if (om){
+ 			tcp_dooptions(tp, ti, om);
+ 			om = 0;
+ 		}
  		tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
  		tp->irs = ti->ti_seq;
  		tcp_sendseqinit(tp);
***************
*** 292,297
  			soisconnected(so);
  			tp->t_state = TCPS_ESTABLISHED;
  			(void) tcp_reass(tp, (struct tcpiphdr *)0);
  		} else
  			tp->t_state = TCPS_SYN_RECEIVED;
  		goto trimthenstep6;

--- 303,321 -----
  			soisconnected(so);
  			tp->t_state = TCPS_ESTABLISHED;
  			(void) tcp_reass(tp, (struct tcpiphdr *)0);
+ 			/* 
+ 			 * Tune maximum TCP segment size to i/f mtu
+ 			 */
+ 			ifp = (struct ifnet *) tcp_getif(tp);
+ 			if (ifp != (struct ifnet *)0){
+ 				ifmss = ifp->if_mtu - sizeof(struct tcpiphdr);
+ 				if (tcpprintfs)
+ 					printf("tcp trim1 (mtu,ms)==(%d,%d) -> ",
+ 						ifmss, tp->t_maxseg);
+ 				tp->t_maxseg = MIN(ifmss, tp->t_maxseg);
+ 				if (tcpprintfs)
+ 					printf("%d\n", tp->t_maxseg);
+ 			}
  		} else
  			tp->t_state = TCPS_SYN_RECEIVED;
  		goto trimthenstep6;
***************
*** 452,457
  		soisconnected(so);
  		tp->t_state = TCPS_ESTABLISHED;
  		(void) tcp_reass(tp, (struct tcpiphdr *)0);
  		tp->snd_wl1 = ti->ti_seq - 1;
  		/* fall into ... */
  

--- 476,494 -----
  		soisconnected(so);
  		tp->t_state = TCPS_ESTABLISHED;
  		(void) tcp_reass(tp, (struct tcpiphdr *)0);
+ 		/* 
+ 		 * Tune maximum TCP segment size to i/f mtu
+ 		 */
+ 		ifp = (struct ifnet *) tcp_getif(tp);
+ 		if (ifp != (struct ifnet *)0){
+ 			ifmss = ifp->if_mtu - sizeof(struct tcpiphdr);
+ 			if (tcpprintfs)
+ 				printf("tcp trim2 (mtu,ms)==(%d,%d) -> ",
+ 					ifmss, tp->t_maxseg);
+ 			tp->t_maxseg = MIN(ifmss, tp->t_maxseg);
+ 			if (tcpprintfs)
+ 				printf("%d\n", tp->t_maxseg);
+ 		}
  		tp->snd_wl1 = ti->ti_seq - 1;
  		/* fall into ... */
  
***************
*** 759,765
  	return;
  }
  
! tcp_dooptions(tp, om)
  	struct tcpcb *tp;
  	struct mbuf *om;
  {

--- 796,802 -----
  	return;
  }
  
! tcp_dooptions(tp, ti, om)
  	struct tcpcb *tp;
  	struct tcpiphdr *ti;
  	struct mbuf *om;
***************
*** 761,766
  
  tcp_dooptions(tp, om)
  	struct tcpcb *tp;
  	struct mbuf *om;
  {
  	register u_char *cp;

--- 798,804 -----
  
  tcp_dooptions(tp, ti, om)
  	struct tcpcb *tp;
+ 	struct tcpiphdr *ti;
  	struct mbuf *om;
  {
  	register u_char *cp;
***************
*** 787,792
  		case TCPOPT_MAXSEG:
  			if (optlen != 4)
  				continue;
  			tp->t_maxseg = *(u_short *)(cp + 2);
  			tp->t_maxseg = ntohs((u_short)tp->t_maxseg);
  			break;

--- 825,832 -----
  		case TCPOPT_MAXSEG:
  			if (optlen != 4)
  				continue;
+ 			if ((ti->ti_flags & TH_SYN) != TH_SYN)
+ 				continue;
  			tp->t_maxseg = *(u_short *)(cp + 2);
  			tp->t_maxseg = ntohs((u_short)tp->t_maxseg);
  			break;
***************
*** tcp_output.c.v0
--- tcp_output.c	
***************
*** 8,13
  #include "../h/socketvar.h"
  #include "../h/errno.h"
  
  #include "../net/route.h"
  
  #include "../netinet/in.h"

--- 9,15 -----
  #include "../h/socketvar.h"
  #include "../h/errno.h"
  
+ #include "../net/if.h"
  #include "../net/route.h"
  
  #include "../netinet/in.h"
***************
*** 37,42
  {
  	register struct socket *so = tp->t_inpcb->inp_socket;
  	register int len;
  	struct mbuf *m0;
  	int off, flags, win, error;
  	register struct mbuf *m;

--- 39,45 -----
  {
  	register struct socket *so = tp->t_inpcb->inp_socket;
  	register int len;
+ 	register struct ifnet *ifp;
  	struct mbuf *m0;
  	int off, flags, win, error;
  	register struct mbuf *m;
***************
*** 44,49
  	u_char *opt;
  	unsigned optlen = 0;
  	int sendalot;
  
  	/*
  	 * Determine length of data that should be transmitted,

--- 47,54 -----
  	u_char *opt;
  	unsigned optlen = 0;
  	int sendalot;
+ 	struct ifnet *tcp_getif();
+ 	u_short ifmss;
  
  	/*
  	 * Determine length of data that should be transmitted,
***************
*** 172,178
  			goto noopt;
  		opt = tcp_initopt;
  		optlen = sizeof (tcp_initopt);
! 		*(u_short *)(opt + 2) = MIN(so->so_rcv.sb_hiwat / 2, 1024);
  		*(u_short *)(opt + 2) = htons(*(u_short *)(opt + 2));
  	} else {
  		if (tp->t_tcpopt == 0)

--- 177,194 -----
  			goto noopt;
  		opt = tcp_initopt;
  		optlen = sizeof (tcp_initopt);
! 
! 		/* 
! 		 * Tune max seg size to mtu of device this connection
! 		 * will run on, in order to avoid IP fragmentation
! 		 * as much as possible. Subtract off standard TCP/IP header.
! 		 */
! 
! 		ifp = tcp_getif(tp);
! 		if (ifp == (struct ifnet *) 0)
! 			goto noopt;
! 		ifmss = ifp->if_mtu - sizeof(struct tcpiphdr);
!  		*(u_short *)(opt + 2) = MIN(so->so_rcv.sb_hiwat / 2, ifmss);
  		*(u_short *)(opt + 2) = htons(*(u_short *)(opt + 2));
  	} else {
  		if (tp->t_tcpopt == 0)
***************

*** tcp_subr.c.v0
--- tcp_subr.c
***************
*** 151,156
  {
  	struct mbuf *m = m_getclr(M_DONTWAIT, MT_PCB);
  	register struct tcpcb *tp;
  
  	if (m == NULL)
  		return ((struct tcpcb *)0);

--- 151,159 -----
  {
  	struct mbuf *m = m_getclr(M_DONTWAIT, MT_PCB);
  	register struct tcpcb *tp;
+ 	struct ifnet *ifp;
+ 	struct ifnet *tcp_getif();
+ 	u_short ifmss;
  
  	if (m == NULL)
  		return ((struct tcpcb *)0);
***************
*** 156,161
  		return ((struct tcpcb *)0);
  	tp = mtod(m, struct tcpcb *);
  	tp->seg_next = tp->seg_prev = (struct tcpiphdr *)tp;
  	/*
  	 * If the default maximum IP packet size is 576 bytes
  	 * and a standard IP header is 20 bytes, with a TCP

--- 159,165 -----
  		return ((struct tcpcb *)0);
  	tp = mtod(m, struct tcpcb *);
  	tp->seg_next = tp->seg_prev = (struct tcpiphdr *)tp;
+ 
  	/*
  	 * If the default maximum IP packet size is 576 bytes
  	 * and a standard IP header is 20 bytes, with a TCP
***************
*** 162,167
  	 * header of 20 bytes plus the options necessary to
  	 * upgrade it to something higher, then initialize the
  	 * maximum segment size to 576 - (20 + 20 + 8 + slop).
  	 */
  	tp->t_maxseg = 512;		/* satisfy the rest of the world */
  	tp->t_flags = 0;		/* sends options! */

--- 166,173 -----
  	 * header of 20 bytes plus the options necessary to
  	 * upgrade it to something higher, then initialize the
  	 * maximum segment size to 576 - (20 + 20 + 8 + slop).
+ 	 * But Postel says make it 536; see <INC-PROJECT, MAX-SEG-SIZ.NLS.14>
+ 	 * and letter of 7 Nov 1983.
  	 */
  
  	tp->t_flags = 0;		/* sends options! */
***************
*** 163,169
  	 * upgrade it to something higher, then initialize the
  	 * maximum segment size to 576 - (20 + 20 + 8 + slop).
  	 */
! 	tp->t_maxseg = 512;		/* satisfy the rest of the world */
  	tp->t_flags = 0;		/* sends options! */
  	tp->t_inpcb = inp;
  	inp->inp_ppcb = (caddr_t)tp;

--- 169,175 -----
  	 * But Postel says make it 536; see <INC-PROJECT, MAX-SEG-SIZ.NLS.14>
  	 * and letter of 7 Nov 1983.
  	 */
! 
  	tp->t_flags = 0;		/* sends options! */
  	tp->t_maxseg = 536;		/* satisfy the rest of the world */
  	tp->t_inpcb = inp;
***************
*** 165,170
  	 */
  	tp->t_maxseg = 512;		/* satisfy the rest of the world */
  	tp->t_flags = 0;		/* sends options! */
  	tp->t_inpcb = inp;
  	inp->inp_ppcb = (caddr_t)tp;
  	return (tp);

--- 171,177 -----
  	 */
  
  	tp->t_flags = 0;		/* sends options! */
+ 	tp->t_maxseg = 536;		/* satisfy the rest of the world */
  	tp->t_inpcb = inp;
  	inp->inp_ppcb = (caddr_t)tp;
  	return (tp);
***************
*** 264,267
  		sin = &((struct icmp *)arg)->icmp_ip.ip_dst;
  		in_pcbnotify(&tcb, sin, (int)inetctlerrmap[cmd], tcp_abort);
  	}
  }

--- 271,313 -----
  		sin = &((struct icmp *)arg)->icmp_ip.ip_dst;
  		in_pcbnotify(&tcb, sin, (int)inetctlerrmap[cmd], tcp_abort);
  	}
+ }
+ 
+ /* 
+  * Given a tcpcb, discover route and determine interface to send
+  * packets over.
+  */
+ 
+ struct ifnet *
+ tcp_getif(tp)
+ struct tcpcb *tp;
+ {
+ 	struct route iproute;
+ 	register struct route *ro;
+ 	struct in_addr faddr;
+ 	register struct ifnet *ifp;
+ 
+ 	ro = &iproute;
+ 	bzero((caddr_t)ro, sizeof(*ro));
+ 	ro->ro_dst.sa_family = AF_INET;
+ 	faddr = tp->t_inpcb->inp_faddr;
+ 		if (faddr.s_addr == 0)
+ 			return (struct ifnet *)0;
+ 
+ 	((struct sockaddr_in *) &iproute.ro_dst)->sin_addr = faddr;
+ 	rtalloc(ro);
+ 	if ((ro->ro_rt == 0) || (ifp = ro->ro_rt->rt_ifp) == 0)
+ 		return (struct ifnet *)0;
+ 	rtfree(ro->ro_rt);
+ 	return ifp;
  }

----------



More information about the Comp.unix.wizards mailing list