File:  [DragonFly] / src / sys / net / ip_mroute / ip_mroute.c
Revision 1.9: download - view: text, annotated - select for diffs
Mon Mar 8 07:43:44 2004 UTC (10 years, 1 month ago) by hsu
Branches: MAIN
CVS tags: HEAD
To comply with the spec, do not copy the TOS from the outer IP
header to the inner IP header of the PIM Register if this is a PIM
Null-Register message.

Submitted by:	Pavlin Radoslavov <pavlin@icir.org>

    1: /*
    2:  * IP multicast forwarding procedures
    3:  *
    4:  * Written by David Waitzman, BBN Labs, August 1988.
    5:  * Modified by Steve Deering, Stanford, February 1989.
    6:  * Modified by Mark J. Steiglitz, Stanford, May, 1991
    7:  * Modified by Van Jacobson, LBL, January 1993
    8:  * Modified by Ajit Thyagarajan, PARC, August 1993
    9:  * Modified by Bill Fenner, PARC, April 1995
   10:  * Modified by Ahmed Helmy, SGI, June 1996
   11:  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
   12:  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
   13:  * Modified by Hitoshi Asaeda, WIDE, August 2000
   14:  * Modified by Pavlin Radoslavov, ICSI, October 2002
   15:  *
   16:  * MROUTING Revision: 3.5
   17:  * and PIM-SMv2 and PIM-DM support, advanced API support,
   18:  * bandwidth metering and signaling
   19:  *
   20:  * $FreeBSD: src/sys/netinet/ip_mroute.c,v 1.56.2.10 2003/08/24 21:37:34 hsu Exp $
   21:  * $DragonFly: src/sys/net/ip_mroute/ip_mroute.c,v 1.9 2004/03/08 07:43:44 hsu Exp $
   22:  */
   23: 
   24: #include "opt_mrouting.h"
   25: #include "opt_random_ip_id.h"
   26: 
   27: #ifdef PIM
   28: #define _PIM_VT 1
   29: #endif
   30: 
   31: #include <sys/param.h>
   32: #include <sys/kernel.h>
   33: #include <sys/malloc.h>
   34: #include <sys/mbuf.h>
   35: #include <sys/protosw.h>
   36: #include <sys/socket.h>
   37: #include <sys/socketvar.h>
   38: #include <sys/sockio.h>
   39: #include <sys/sysctl.h>
   40: #include <sys/syslog.h>
   41: #include <sys/systm.h>
   42: #include <sys/time.h>
   43: #include <sys/in_cksum.h>
   44: #include <net/if.h>
   45: #include <net/netisr.h>
   46: #include <net/route.h>
   47: #include <netinet/in.h>
   48: #include <netinet/igmp.h>
   49: #include <netinet/in_systm.h>
   50: #include <netinet/in_var.h>
   51: #include <netinet/ip.h>
   52: #include "ip_mroute.h"
   53: #include <netinet/ip_var.h>
   54: #ifdef PIM
   55: #include <netinet/pim.h>
   56: #include <netinet/pim_var.h>
   57: #endif
   58: #include <netinet/udp.h>
   59: 
/*
 * Control debugging code for rsvp and multicast routing code.
 * Can only set them with the debugger.
 */
static	u_int	rsvpdebug;		/* non-zero enables debugging   */

static	u_int	mrtdebug;		/* any set of the flags below   */

/* Bit flags tested against mrtdebug to select a debugging category. */
#define		DEBUG_MFC	0x02
#define		DEBUG_FORWARD	0x04
#define		DEBUG_EXPIRE	0x08
#define		DEBUG_XMIT	0x10
#define		DEBUG_PIM	0x20

/* Sentinel vif index: -1 converted to vifi_t. */
#define		VIFI_INVALID	((vifi_t) -1)

/* Non-zero when the M_EXT bit is set in the mbuf's m_flags. */
#define M_HASCL(m)	((m)->m_flags & M_EXT)

/* Allocation type used for the routing-table objects freed in this file. */
static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables");

/* Multicast routing statistics, exported read/write through sysctl. */
static struct mrtstat	mrtstat;
SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW,
    &mrtstat, mrtstat,
    "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)");
   84: 
/*
 * Multicast forwarding cache: MFCTBLSIZ hash buckets, each a list of
 * struct mfc chained through mfc_next and indexed by MFCHASH().
 * Exported read-only through sysctl.
 */
static struct mfc	*mfctable[MFCTBLSIZ];
SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD,
    &mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]",
    "Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)");

/*
 * Virtual interface table, indexed by vif number.  A slot whose
 * v_lcl_addr is INADDR_ANY is treated as unused by add_vif()/del_vif().
 * Exported read-only through sysctl.
 */
static struct vif	viftable[MAXVIFS];
SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD,
    &viftable, sizeof(viftable), "S,vif[MAXVIFS]",
    "Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");

/*
 * Per-hash-bucket counter, one per mfctable slot; add_mfc() decrements
 * it when a stalled entry is resolved.
 */
static u_char		nexpire[MFCTBLSIZ];

/* Handle of the expire_upcalls() timeout armed by ip_mrouter_init(). */
static struct callout_handle expire_upcalls_ch;

#define		EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second		*/
#define		UPCALL_EXPIRE	6		/* number of timeouts	*/

/*
 * Define the token bucket filter structures
 * tbftable -> each vif has one of these for storing info
 */

static struct tbf tbftable[MAXVIFS];
#define		TBF_REPROCESS	(hz / 100)	/* 100x / second */
  109: 
/*
 * 'Interfaces' associated with decapsulator (so we can tell
 * packets that went through it from ones that get reflected
 * by a broken gateway).  These interfaces are never linked into
 * the system ifnet list & no routes point to them.  I.e., packets
 * can't be sent this way.  They only exist as a placeholder for
 * multicast source verification.
 *
 * There is one per vif slot; add_vif() names them "mdecap<N>" the first
 * time an encapsulating tunnel is configured.
 */
static struct ifnet multicast_decap_if[MAXVIFS];

#define ENCAP_TTL 64			/* TTL placed in the outer IP header */
#define ENCAP_PROTO IPPROTO_IPIP	/* 4 */

/*
 * prototype IP hdr for encapsulated packets
 *
 * Positional initializer: the first two values are the header-length and
 * version fields, whose order is selected by BYTE_ORDER.  Fields after
 * the checksum are left zero-initialized.
 */
static struct ip multicast_encap_iphdr = {
#if BYTE_ORDER == LITTLE_ENDIAN
	sizeof(struct ip) >> 2, IPVERSION,
#else
	IPVERSION, sizeof(struct ip) >> 2,
#endif
	0,				/* tos */
	sizeof(struct ip),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, ENCAP_PROTO,
	0,				/* checksum */
};
  137: 
/*
 * Bandwidth meter variables and constants
 */
static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
/*
 * Pending timeouts are stored in a hash table, the key being the
 * expiration time. Periodically, the entries are analysed and processed.
 */
#define BW_METER_BUCKETS	1024
static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
/* Handle of the expire_bw_meter_process() timeout armed by ip_mrouter_init(). */
static struct callout_handle bw_meter_ch;
#define BW_METER_PERIOD (hz)		/* periodical handling of bw meters */

/*
 * Pending upcalls are stored in a vector which is flushed when
 * full, or periodically
 */
static struct bw_upcall	bw_upcalls[BW_UPCALLS_MAX];
static u_int	bw_upcalls_n; /* # of pending upcalls */
/* Handle of the expire_bw_upcalls_send() timeout armed by ip_mrouter_init(). */
static struct callout_handle bw_upcalls_ch;
#define BW_UPCALLS_PERIOD (hz)		/* periodical flush of bw upcalls */
  159: 
#ifdef PIM
/* PIM statistics, exported read-only through sysctl. */
static struct pimstat pimstat;
SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD,
    &pimstat, pimstat,
    "PIM Statistics (struct pimstat, netinet/pim_var.h)");

/*
 * Note: the PIM Register encapsulation adds the following in front of a
 * data packet:
 *
 * struct pim_encap_hdr {
 *    struct ip ip;
 *    struct pim_encap_pimhdr  pim;
 * }
 *
 */

/* PIM header followed by a 32-bit flags word. */
struct pim_encap_pimhdr {
	struct pim pim;
	uint32_t   flags;
};

/*
 * Prototype outer IP header for PIM Register packets.  Same positional
 * layout as multicast_encap_iphdr, but with protocol IPPROTO_PIM.
 */
static struct ip pim_encap_iphdr = {
#if BYTE_ORDER == LITTLE_ENDIAN
	sizeof(struct ip) >> 2,
	IPVERSION,
#else
	IPVERSION,
	sizeof(struct ip) >> 2,
#endif
	0,			/* tos */
	sizeof(struct ip),	/* total length */
	0,			/* id */
	0,			/* frag offset */ 
	ENCAP_TTL,
	IPPROTO_PIM,
	0,			/* checksum */
};

/* Prototype PIM Register header: version/type set, everything else zero. */
static struct pim_encap_pimhdr pim_encap_pimhdr = {
    {
	PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
	0,			/* reserved */
	0,			/* checksum */
    },
    0				/* flags */
};

/* Placeholder ifnet used as v_ifp of a VIFF_REGISTER vif (see add_vif()). */
static struct ifnet multicast_register_if;
/*
 * Index of the register vif, or VIFI_INVALID when none is configured;
 * set by add_vif() and reset by del_vif() and X_ip_mrouter_done().
 */
static vifi_t reg_vif_num = VIFI_INVALID;
#endif /* PIM */
  211: 
/*
 * Private variables.
 */
/*
 * One more than the highest vif index in use: add_vif() raises it and
 * del_vif() lowers it past trailing unused slots.
 */
static vifi_t	   numvifs;
/* Set to 1 by add_vif() once the first encapsulating tunnel is configured. */
static int have_encap_tunnel;

/*
 * one-back cache used by ipip_input to locate a tunnel's vif
 * given a datagram's src ip address.
 * Cleared by del_vif() (when it names the deleted vif) and by
 * X_ip_mrouter_done().
 */
static u_long last_encap_src;
static struct vif *last_encap_vif;
  224: 
/*
 * Entry points carrying the X_ prefix.
 * NOTE(review): presumably installed into kernel hook pointers elsewhere
 * in the file - confirm against the module load code.
 */
static u_long	X_ip_mcast_src(int vifi);
static int	X_ip_mforward(struct ip *ip, struct ifnet *ifp,
			struct mbuf *m, struct ip_moptions *imo);
static int	X_ip_mrouter_done(void);
static int	X_ip_mrouter_get(struct socket *so, struct sockopt *m);
static int	X_ip_mrouter_set(struct socket *so, struct sockopt *m);
static int	X_legal_vif_num(int vif);
static int	X_mrt_ioctl(int cmd, caddr_t data);

/*
 * File-local helpers: table maintenance, forwarding and token bucket
 * filtering.
 */
static int get_sg_cnt(struct sioc_sg_req *);
static int get_vif_cnt(struct sioc_vif_req *);
static int ip_mrouter_init(struct socket *, int);
static int add_vif(struct vifctl *);
static int del_vif(vifi_t);
static int add_mfc(struct mfcctl2 *);
static int del_mfc(struct mfcctl2 *);
static int set_api_config(uint32_t *); /* choose API capabilities */
static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
static int set_assert(int);
static void expire_upcalls(void *);
static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
static void phyint_send(struct ip *, struct vif *, struct mbuf *);
static void encap_send(struct ip *, struct vif *, struct mbuf *);
static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long);
static void tbf_queue(struct vif *, struct mbuf *);
static void tbf_process_q(struct vif *);
static void tbf_reprocess_q(void *);
static int tbf_dq_sel(struct vif *, struct ip *);
static void tbf_send_packet(struct vif *, struct mbuf *);
static void tbf_update_tokens(struct vif *);
static int priority(struct vif *, struct ip *);

/*
 * Bandwidth monitoring
 */
static void free_bw_list(struct bw_meter *list);
static int add_bw_upcall(struct bw_upcall *);
static int del_bw_upcall(struct bw_upcall *);
static void bw_meter_receive_packet(struct bw_meter *x, int plen,
		struct timeval *nowp);
static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp);
static void bw_upcalls_send(void);
static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp);
static void unschedule_bw_meter(struct bw_meter *x);
static void bw_meter_process(void);
static void expire_bw_upcalls_send(void *);
static void expire_bw_meter_process(void *);

/* PIM Register transmission helpers, compiled only with PIM support. */
#ifdef PIM
static int pim_register_send(struct ip *, struct vif *,
		struct mbuf *, struct mfc *);
static int pim_register_send_rp(struct ip *, struct vif *,
		struct mbuf *, struct mfc *);
static int pim_register_send_upcall(struct ip *, struct vif *,
		struct mbuf *, struct mfc *);
static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
#endif
  282: 
/*
 * whether or not special PIM assert processing is enabled.
 * Holds 0 or 1; written by set_assert() and cleared by
 * ip_mrouter_init() and X_ip_mrouter_done().
 */
static int pim_assert;
/*
 * Rate limit for assert notification messages, in usec
 */
#define ASSERT_MSG_TIME		3000000

/*
 * Kernel multicast routing API capabilities and setup.
 * If more API capabilities are added to the kernel, they should be
 * recorded in `mrt_api_support'.
 */
static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
					 MRT_MFC_FLAGS_BORDER_VIF |
					 MRT_MFC_RP |
					 MRT_MFC_BW_UPCALL);
/*
 * Capabilities currently enabled: set_api_config() stores the requested
 * bits masked by mrt_api_support; reset to 0 on init and shutdown.
 */
static uint32_t mrt_api_config = 0;

/*
 * Hash function for a source, group entry
 * Folds the origin address `a' and group address `g' together by XORing
 * each with itself shifted right by 20 and 10 bits, then reduces the
 * result with MFCHASHMOD.  Both arguments are evaluated several times.
 */
#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
			((g) >> 20) ^ ((g) >> 10) ^ (g))
  308: 
  309: /*
  310:  * Find a route for a given origin IP address and Multicast group address
  311:  * Type of service parameter to be added in the future!!!
  312:  * Statistics are updated by the caller if needed
  313:  * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
  314:  */
  315: static struct mfc *
  316: mfc_find(in_addr_t o, in_addr_t g)
  317: {
  318:     struct mfc *rt;
  319: 
  320:     for (rt = mfctable[MFCHASH(o,g)]; rt; rt = rt->mfc_next)
  321: 	if ((rt->mfc_origin.s_addr == o) &&
  322: 		(rt->mfc_mcastgrp.s_addr == g) && (rt->mfc_stall == NULL))
  323: 	    break;
  324:     return rt;
  325: }
  326: 
  327: /*
  328:  * Macros to compute elapsed time efficiently
  329:  * Borrowed from Van Jacobson's scheduling code
  330:  */
  331: #define TV_DELTA(a, b, delta) {					\
  332: 	int xxs;						\
  333: 	delta = (a).tv_usec - (b).tv_usec;			\
  334: 	if ((xxs = (a).tv_sec - (b).tv_sec)) {			\
  335: 		switch (xxs) {					\
  336: 		case 2:						\
  337: 			delta += 1000000;			\
  338: 			/* FALLTHROUGH */			\
  339: 		case 1:						\
  340: 			delta += 1000000;			\
  341: 			break;					\
  342: 		default:					\
  343: 			delta += (1000000 * xxs);		\
  344: 		}						\
  345: 	}							\
  346: }
  347: 
  348: #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
  349: 	      (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
  350: 
  351: /*
  352:  * Handle MRT setsockopt commands to modify the multicast routing tables.
  353:  */
  354: static int
  355: X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
  356: {
  357:     int	error, optval;
  358:     vifi_t	vifi;
  359:     struct	vifctl vifc;
  360:     struct	mfcctl2 mfc;
  361:     struct	bw_upcall bw_upcall;
  362:     uint32_t	i;
  363: 
  364:     if (so != ip_mrouter && sopt->sopt_name != MRT_INIT)
  365: 	return EPERM;
  366: 
  367:     error = 0;
  368:     switch (sopt->sopt_name) {
  369:     case MRT_INIT:
  370: 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
  371: 	if (error)
  372: 	    break;
  373: 	error = ip_mrouter_init(so, optval);
  374: 	break;
  375: 
  376:     case MRT_DONE:
  377: 	error = ip_mrouter_done();
  378: 	break;
  379: 
  380:     case MRT_ADD_VIF:
  381: 	error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
  382: 	if (error)
  383: 	    break;
  384: 	error = add_vif(&vifc);
  385: 	break;
  386: 
  387:     case MRT_DEL_VIF:
  388: 	error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
  389: 	if (error)
  390: 	    break;
  391: 	error = del_vif(vifi);
  392: 	break;
  393: 
  394:     case MRT_ADD_MFC:
  395:     case MRT_DEL_MFC:
  396: 	/*
  397: 	 * select data size depending on API version.
  398: 	 */
  399: 	if (sopt->sopt_name == MRT_ADD_MFC &&
  400: 		mrt_api_config & MRT_API_FLAGS_ALL) {
  401: 	    error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2),
  402: 				sizeof(struct mfcctl2));
  403: 	} else {
  404: 	    error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl),
  405: 				sizeof(struct mfcctl));
  406: 	    bzero((caddr_t)&mfc + sizeof(struct mfcctl),
  407: 			sizeof(mfc) - sizeof(struct mfcctl));
  408: 	}
  409: 	if (error)
  410: 	    break;
  411: 	if (sopt->sopt_name == MRT_ADD_MFC)
  412: 	    error = add_mfc(&mfc);
  413: 	else
  414: 	    error = del_mfc(&mfc);
  415: 	break;
  416: 
  417:     case MRT_ASSERT:
  418: 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
  419: 	if (error)
  420: 	    break;
  421: 	set_assert(optval);
  422: 	break;
  423: 
  424:     case MRT_API_CONFIG:
  425: 	error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
  426: 	if (!error)
  427: 	    error = set_api_config(&i);
  428: 	if (!error)
  429: 	    error = sooptcopyout(sopt, &i, sizeof i);
  430: 	break;
  431: 
  432:     case MRT_ADD_BW_UPCALL:
  433:     case MRT_DEL_BW_UPCALL:
  434: 	error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall,
  435: 				sizeof bw_upcall);
  436: 	if (error)
  437: 	    break;
  438: 	if (sopt->sopt_name == MRT_ADD_BW_UPCALL)
  439: 	    error = add_bw_upcall(&bw_upcall);
  440: 	else
  441: 	    error = del_bw_upcall(&bw_upcall);
  442: 	break;
  443: 
  444:     default:
  445: 	error = EOPNOTSUPP;
  446: 	break;
  447:     }
  448:     return error;
  449: }
  450: 
  451: /*
  452:  * Handle MRT getsockopt commands
  453:  */
  454: static int
  455: X_ip_mrouter_get(struct socket *so, struct sockopt *sopt)
  456: {
  457:     int error;
  458:     static int version = 0x0305; /* !!! why is this here? XXX */
  459: 
  460:     switch (sopt->sopt_name) {
  461:     case MRT_VERSION:
  462: 	error = sooptcopyout(sopt, &version, sizeof version);
  463: 	break;
  464: 
  465:     case MRT_ASSERT:
  466: 	error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert);
  467: 	break;
  468: 
  469:     case MRT_API_SUPPORT:
  470: 	error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support);
  471: 	break;
  472: 
  473:     case MRT_API_CONFIG:
  474: 	error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config);
  475: 	break;
  476: 
  477:     default:
  478: 	error = EOPNOTSUPP;
  479: 	break;
  480:     }
  481:     return error;
  482: }
  483: 
  484: /*
  485:  * Handle ioctl commands to obtain information from the cache
  486:  */
  487: static int
  488: X_mrt_ioctl(int cmd, caddr_t data)
  489: {
  490:     int error = 0;
  491: 
  492:     switch (cmd) {
  493:     case SIOCGETVIFCNT:
  494: 	error = get_vif_cnt((struct sioc_vif_req *)data);
  495: 	break;
  496: 
  497:     case SIOCGETSGCNT:
  498: 	error = get_sg_cnt((struct sioc_sg_req *)data);
  499: 	break;
  500: 
  501:     default:
  502: 	error = EINVAL;
  503: 	break;
  504:     }
  505:     return error;
  506: }
  507: 
  508: /*
  509:  * returns the packet, byte, rpf-failure count for the source group provided
  510:  */
  511: static int
  512: get_sg_cnt(struct sioc_sg_req *req)
  513: {
  514:     int s;
  515:     struct mfc *rt;
  516: 
  517:     s = splnet();
  518:     rt = mfc_find(req->src.s_addr, req->grp.s_addr);
  519:     splx(s);
  520:     if (rt == NULL) {
  521: 	req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
  522: 	return EADDRNOTAVAIL;
  523:     }
  524:     req->pktcnt = rt->mfc_pkt_cnt;
  525:     req->bytecnt = rt->mfc_byte_cnt;
  526:     req->wrong_if = rt->mfc_wrong_if;
  527:     return 0;
  528: }
  529: 
  530: /*
  531:  * returns the input and output packet and byte counts on the vif provided
  532:  */
  533: static int
  534: get_vif_cnt(struct sioc_vif_req *req)
  535: {
  536:     vifi_t vifi = req->vifi;
  537: 
  538:     if (vifi >= numvifs)
  539: 	return EINVAL;
  540: 
  541:     req->icount = viftable[vifi].v_pkt_in;
  542:     req->ocount = viftable[vifi].v_pkt_out;
  543:     req->ibytes = viftable[vifi].v_bytes_in;
  544:     req->obytes = viftable[vifi].v_bytes_out;
  545: 
  546:     return 0;
  547: }
  548: 
  549: /*
  550:  * Enable multicast routing
  551:  */
  552: static int
  553: ip_mrouter_init(struct socket *so, int version)
  554: {
  555:     if (mrtdebug)
  556: 	log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
  557: 	    so->so_type, so->so_proto->pr_protocol);
  558: 
  559:     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP)
  560: 	return EOPNOTSUPP;
  561: 
  562:     if (version != 1)
  563: 	return ENOPROTOOPT;
  564: 
  565:     if (ip_mrouter != NULL)
  566: 	return EADDRINUSE;
  567: 
  568:     ip_mrouter = so;
  569: 
  570:     bzero((caddr_t)mfctable, sizeof(mfctable));
  571:     bzero((caddr_t)nexpire, sizeof(nexpire));
  572: 
  573:     pim_assert = 0;
  574: 
  575:     expire_upcalls_ch = timeout(expire_upcalls, NULL, EXPIRE_TIMEOUT);
  576: 
  577:     bw_upcalls_n = 0;
  578:     bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers));
  579:     bw_upcalls_ch = timeout(expire_bw_upcalls_send, NULL, BW_UPCALLS_PERIOD);
  580:     bw_meter_ch = timeout(expire_bw_meter_process, NULL, BW_METER_PERIOD);
  581: 
  582:     mrt_api_config = 0;
  583: 
  584:     if (mrtdebug)
  585: 	log(LOG_DEBUG, "ip_mrouter_init\n");
  586: 
  587:     return 0;
  588: }
  589: 
  590: /*
  591:  * Disable multicast routing
  592:  */
  593: static int
  594: X_ip_mrouter_done(void)
  595: {
  596:     vifi_t vifi;
  597:     int i;
  598:     struct ifnet *ifp;
  599:     struct ifreq ifr;
  600:     struct mfc *rt;
  601:     struct rtdetq *rte;
  602:     int s;
  603: 
  604:     s = splnet();
  605: 
  606:     /*
  607:      * For each phyint in use, disable promiscuous reception of all IP
  608:      * multicasts.
  609:      */
  610:     for (vifi = 0; vifi < numvifs; vifi++) {
  611: 	if (viftable[vifi].v_lcl_addr.s_addr != 0 &&
  612: 		!(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
  613: 	    struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr);
  614: 
  615: 	    so->sin_len = sizeof(struct sockaddr_in);
  616: 	    so->sin_family = AF_INET;
  617: 	    so->sin_addr.s_addr = INADDR_ANY;
  618: 	    ifp = viftable[vifi].v_ifp;
  619: 	    if_allmulti(ifp, 0);
  620: 	}
  621:     }
  622:     bzero((caddr_t)tbftable, sizeof(tbftable));
  623:     bzero((caddr_t)viftable, sizeof(viftable));
  624:     numvifs = 0;
  625:     pim_assert = 0;
  626: 
  627:     untimeout(expire_upcalls, NULL, expire_upcalls_ch);
  628: 
  629:     mrt_api_config = 0;
  630:     bw_upcalls_n = 0;
  631:     untimeout(expire_bw_upcalls_send, NULL, bw_upcalls_ch);
  632:     untimeout(expire_bw_meter_process, NULL, bw_meter_ch);
  633: 
  634:     /*
  635:      * Free all multicast forwarding cache entries.
  636:      */
  637:     for (i = 0; i < MFCTBLSIZ; i++) {
  638: 	for (rt = mfctable[i]; rt != NULL; ) {
  639: 	    struct mfc *nr = rt->mfc_next;
  640: 
  641: 	    for (rte = rt->mfc_stall; rte != NULL; ) {
  642: 		struct rtdetq *n = rte->next;
  643: 
  644: 		m_freem(rte->m);
  645: 		free(rte, M_MRTABLE);
  646: 		rte = n;
  647: 	    }
  648: 	    free_bw_list(rt->mfc_bw_meter);
  649: 	    free(rt, M_MRTABLE);
  650: 	    rt = nr;
  651: 	}
  652:     }
  653: 
  654:     bzero((caddr_t)mfctable, sizeof(mfctable));
  655: 
  656:     bzero(bw_meter_timers, sizeof(bw_meter_timers));
  657: 
  658:     /*
  659:      * Reset de-encapsulation cache
  660:      */
  661:     last_encap_src = INADDR_ANY;
  662:     last_encap_vif = NULL;
  663: #ifdef PIM
  664:     reg_vif_num = VIFI_INVALID;
  665: #endif
  666:     have_encap_tunnel = 0;
  667: 
  668:     ip_mrouter = NULL;
  669: 
  670:     splx(s);
  671: 
  672:     if (mrtdebug)
  673: 	log(LOG_DEBUG, "ip_mrouter_done\n");
  674: 
  675:     return 0;
  676: }
  677: 
  678: /*
  679:  * Set PIM assert processing global
  680:  */
  681: static int
  682: set_assert(int i)
  683: {
  684:     if ((i != 1) && (i != 0))
  685: 	return EINVAL;
  686: 
  687:     pim_assert = i;
  688: 
  689:     return 0;
  690: }
  691: 
  692: /*
  693:  * Configure API capabilities
  694:  */
  695: int
  696: set_api_config(uint32_t *apival)
  697: {
  698:     int i;
  699: 
  700:     /*
  701:      * We can set the API capabilities only if it is the first operation
  702:      * after MRT_INIT. I.e.:
  703:      *  - there are no vifs installed
  704:      *  - pim_assert is not enabled
  705:      *  - the MFC table is empty
  706:      */
  707:     if (numvifs > 0) {
  708: 	*apival = 0;
  709: 	return EPERM;
  710:     }
  711:     if (pim_assert) {
  712: 	*apival = 0;
  713: 	return EPERM;
  714:     }
  715:     for (i = 0; i < MFCTBLSIZ; i++) {
  716: 	if (mfctable[i] != NULL) {
  717: 	    *apival = 0;
  718: 	    return EPERM;
  719: 	}
  720:     }
  721: 
  722:     mrt_api_config = *apival & mrt_api_support;
  723:     *apival = mrt_api_config;
  724: 
  725:     return 0;
  726: }
  727: 
  728: /*
  729:  * Add a vif to the vif table
  730:  */
  731: static int
  732: add_vif(struct vifctl *vifcp)
  733: {
  734:     struct vif *vifp = viftable + vifcp->vifc_vifi;
  735:     struct sockaddr_in sin = {sizeof sin, AF_INET};
  736:     struct ifaddr *ifa;
  737:     struct ifnet *ifp;
  738:     int error, s;
  739:     struct tbf *v_tbf = tbftable + vifcp->vifc_vifi;
  740: 
  741:     if (vifcp->vifc_vifi >= MAXVIFS)
  742: 	return EINVAL;
  743:     if (vifp->v_lcl_addr.s_addr != INADDR_ANY)
  744: 	return EADDRINUSE;
  745:     if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY)
  746: 	return EADDRNOTAVAIL;
  747: 
  748:     /* Find the interface with an address in AF_INET family */
  749: #ifdef PIM
  750:     if (vifcp->vifc_flags & VIFF_REGISTER) {
  751: 	/*
  752: 	 * XXX: Because VIFF_REGISTER does not really need a valid
  753: 	 * local interface (e.g. it could be 127.0.0.2), we don't
  754: 	 * check its address.
  755: 	 */
  756: 	ifp = NULL;
  757:     } else
  758: #endif
  759:     {
  760: 	sin.sin_addr = vifcp->vifc_lcl_addr;
  761: 	ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
  762: 	if (ifa == NULL)
  763: 	    return EADDRNOTAVAIL;
  764: 	ifp = ifa->ifa_ifp;
  765:     }
  766: 
  767:     if (vifcp->vifc_flags & VIFF_TUNNEL) {
  768: 	if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) {
  769: 	    /*
  770: 	     * An encapsulating tunnel is wanted.  Tell ipip_input() to
  771: 	     * start paying attention to encapsulated packets.
  772: 	     */
  773: 	    if (have_encap_tunnel == 0) {
  774: 		have_encap_tunnel = 1;
  775: 		for (s = 0; s < MAXVIFS; ++s) {
  776: 		    if_initname(&multicast_decap_if[s], "mdecap", s);
  777: 		}
  778: 	    }
  779: 	    /*
  780: 	     * Set interface to fake encapsulator interface
  781: 	     */
  782: 	    ifp = &multicast_decap_if[vifcp->vifc_vifi];
  783: 	    /*
  784: 	     * Prepare cached route entry
  785: 	     */
  786: 	    bzero(&vifp->v_route, sizeof(vifp->v_route));
  787: 	} else {
  788: 	    log(LOG_ERR, "source routed tunnels not supported\n");
  789: 	    return EOPNOTSUPP;
  790: 	}
  791: #ifdef PIM
  792:     } else if (vifcp->vifc_flags & VIFF_REGISTER) {
  793: 	ifp = &multicast_register_if;
  794: 	if (mrtdebug)
  795: 	    log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
  796: 		    (void *)&multicast_register_if);
  797: 	if (reg_vif_num == VIFI_INVALID) {
  798: 	    if_initname(&multicast_register_if, "register_vif", 0);
  799: 	    multicast_register_if.if_flags = IFF_LOOPBACK;
  800: 	    bzero(&vifp->v_route, sizeof(vifp->v_route));
  801: 	    reg_vif_num = vifcp->vifc_vifi;
  802: 	}
  803: #endif
  804:     } else {		/* Make sure the interface supports multicast */
  805: 	if ((ifp->if_flags & IFF_MULTICAST) == 0)
  806: 	    return EOPNOTSUPP;
  807: 
  808: 	/* Enable promiscuous reception of all IP multicasts from the if */
  809: 	s = splnet();
  810: 	error = if_allmulti(ifp, 1);
  811: 	splx(s);
  812: 	if (error)
  813: 	    return error;
  814:     }
  815: 
  816:     s = splnet();
  817:     /* define parameters for the tbf structure */
  818:     vifp->v_tbf = v_tbf;
  819:     GET_TIME(vifp->v_tbf->tbf_last_pkt_t);
  820:     vifp->v_tbf->tbf_n_tok = 0;
  821:     vifp->v_tbf->tbf_q_len = 0;
  822:     vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
  823:     vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
  824: 
  825:     vifp->v_flags     = vifcp->vifc_flags;
  826:     vifp->v_threshold = vifcp->vifc_threshold;
  827:     vifp->v_lcl_addr  = vifcp->vifc_lcl_addr;
  828:     vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
  829:     vifp->v_ifp       = ifp;
  830:     /* scaling up here allows division by 1024 in critical code */
  831:     vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000;
  832:     vifp->v_rsvp_on   = 0;
  833:     vifp->v_rsvpd     = NULL;
  834:     /* initialize per vif pkt counters */
  835:     vifp->v_pkt_in    = 0;
  836:     vifp->v_pkt_out   = 0;
  837:     vifp->v_bytes_in  = 0;
  838:     vifp->v_bytes_out = 0;
  839:     splx(s);
  840: 
  841:     /* Adjust numvifs up if the vifi is higher than numvifs */
  842:     if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1;
  843: 
  844:     if (mrtdebug)
  845: 	log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n",
  846: 	    vifcp->vifc_vifi,
  847: 	    (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr),
  848: 	    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
  849: 	    (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr),
  850: 	    vifcp->vifc_threshold,
  851: 	    vifcp->vifc_rate_limit);
  852: 
  853:     return 0;
  854: }
  855: 
  856: /*
  857:  * Delete a vif from the vif table
  858:  */
  859: static int
  860: del_vif(vifi_t vifi)
  861: {
  862:     struct vif *vifp;
  863:     int s;
  864: 
  865:     if (vifi >= numvifs)
  866: 	return EINVAL;
  867:     vifp = &viftable[vifi];
  868:     if (vifp->v_lcl_addr.s_addr == INADDR_ANY)
  869: 	return EADDRNOTAVAIL;
  870: 
  871:     s = splnet();
  872: 
  873:     if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER)))
  874: 	if_allmulti(vifp->v_ifp, 0);
  875: 
  876:     if (vifp == last_encap_vif) {
  877: 	last_encap_vif = NULL;
  878: 	last_encap_src = INADDR_ANY;
  879:     }
  880: 
  881:     /*
  882:      * Free packets queued at the interface
  883:      */
  884:     while (vifp->v_tbf->tbf_q) {
  885: 	struct mbuf *m = vifp->v_tbf->tbf_q;
  886: 
  887: 	vifp->v_tbf->tbf_q = m->m_act;
  888: 	m_freem(m);
  889:     }
  890: 
  891: #ifdef PIM
  892:     if (vifp->v_flags & VIFF_REGISTER)
  893: 	reg_vif_num = VIFI_INVALID;
  894: #endif
  895: 
  896:     bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf)));
  897:     bzero((caddr_t)vifp, sizeof (*vifp));
  898: 
  899:     if (mrtdebug)
  900: 	log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs);
  901: 
  902:     /* Adjust numvifs down */
  903:     for (vifi = numvifs; vifi > 0; vifi--)
  904: 	if (viftable[vifi-1].v_lcl_addr.s_addr != INADDR_ANY)
  905: 	    break;
  906:     numvifs = vifi;
  907: 
  908:     splx(s);
  909: 
  910:     return 0;
  911: }
  912: 
  913: /*
  914:  * update an mfc entry without resetting counters and S,G addresses.
  915:  */
  916: static void
  917: update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
  918: {
  919:     int i;
  920: 
  921:     rt->mfc_parent = mfccp->mfcc_parent;
  922:     for (i = 0; i < numvifs; i++) {
  923: 	rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
  924: 	rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
  925: 	    MRT_MFC_FLAGS_ALL;
  926:     }
  927:     /* set the RP address */
  928:     if (mrt_api_config & MRT_MFC_RP)
  929: 	rt->mfc_rp = mfccp->mfcc_rp;
  930:     else
  931: 	rt->mfc_rp.s_addr = INADDR_ANY;
  932: }
  933: 
  934: /*
  935:  * fully initialize an mfc entry from the parameter.
  936:  */
  937: static void
  938: init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
  939: {
  940:     rt->mfc_origin     = mfccp->mfcc_origin;
  941:     rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
  942: 
  943:     update_mfc_params(rt, mfccp);
  944: 
  945:     /* initialize pkt counters per src-grp */
  946:     rt->mfc_pkt_cnt    = 0;
  947:     rt->mfc_byte_cnt   = 0;
  948:     rt->mfc_wrong_if   = 0;
  949:     rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0;
  950: }
  951: 
  952: 
  953: /*
  954:  * Add an mfc entry
  955:  */
  956: static int
  957: add_mfc(struct mfcctl2 *mfccp)
  958: {
  959:     struct mfc *rt;
  960:     u_long hash;
  961:     struct rtdetq *rte;
  962:     u_short nstl;
  963:     int s;
  964: 
  965:     rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
  966: 
  967:     /* If an entry already exists, just update the fields */
  968:     if (rt) {
  969: 	if (mrtdebug & DEBUG_MFC)
  970: 	    log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n",
  971: 		(u_long)ntohl(mfccp->mfcc_origin.s_addr),
  972: 		(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
  973: 		mfccp->mfcc_parent);
  974: 
  975: 	s = splnet();
  976: 	update_mfc_params(rt, mfccp);
  977: 	splx(s);
  978: 	return 0;
  979:     }
  980: 
  981:     /*
  982:      * Find the entry for which the upcall was made and update
  983:      */
  984:     s = splnet();
  985:     hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
  986:     for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) {
  987: 
  988: 	if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
  989: 		(rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
  990: 		(rt->mfc_stall != NULL)) {
  991: 
  992: 	    if (nstl++)
  993: 		log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n",
  994: 		    "multiple kernel entries",
  995: 		    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
  996: 		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
  997: 		    mfccp->mfcc_parent, (void *)rt->mfc_stall);
  998: 
  999: 	    if (mrtdebug & DEBUG_MFC)
 1000: 		log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n",
 1001: 		    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
 1002: 		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1003: 		    mfccp->mfcc_parent, (void *)rt->mfc_stall);
 1004: 
 1005: 	    init_mfc_params(rt, mfccp);
 1006: 
 1007: 	    rt->mfc_expire = 0;	/* Don't clean this guy up */
 1008: 	    nexpire[hash]--;
 1009: 
 1010: 	    /* free packets Qed at the end of this entry */
 1011: 	    for (rte = rt->mfc_stall; rte != NULL; ) {
 1012: 		struct rtdetq *n = rte->next;
 1013: 
 1014: 		ip_mdq(rte->m, rte->ifp, rt, -1);
 1015: 		m_freem(rte->m);
 1016: 		free(rte, M_MRTABLE);
 1017: 		rte = n;
 1018: 	    }
 1019: 	    rt->mfc_stall = NULL;
 1020: 	}
 1021:     }
 1022: 
 1023:     /*
 1024:      * It is possible that an entry is being inserted without an upcall
 1025:      */
 1026:     if (nstl == 0) {
 1027: 	if (mrtdebug & DEBUG_MFC)
 1028: 	    log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n",
 1029: 		hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr),
 1030: 		(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 1031: 		mfccp->mfcc_parent);
 1032: 
 1033: 	for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) {
 1034: 	    if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
 1035: 		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) {
 1036: 		init_mfc_params(rt, mfccp);
 1037: 		if (rt->mfc_expire)
 1038: 		    nexpire[hash]--;
 1039: 		rt->mfc_expire = 0;
 1040: 		break; /* XXX */
 1041: 	    }
 1042: 	}
 1043: 	if (rt == NULL) {		/* no upcall, so make a new entry */
 1044: 	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
 1045: 	    if (rt == NULL) {
 1046: 		splx(s);
 1047: 		return ENOBUFS;
 1048: 	    }
 1049: 
 1050: 	    init_mfc_params(rt, mfccp);
 1051: 	    rt->mfc_expire     = 0;
 1052: 	    rt->mfc_stall      = NULL;
 1053: 
 1054: 	    rt->mfc_bw_meter = NULL;
 1055: 	    /* insert new entry at head of hash chain */
 1056: 	    rt->mfc_next = mfctable[hash];
 1057: 	    mfctable[hash] = rt;
 1058: 	}
 1059:     }
 1060:     splx(s);
 1061:     return 0;
 1062: }
 1063: 
 1064: /*
 1065:  * Delete an mfc entry
 1066:  */
 1067: static int
 1068: del_mfc(struct mfcctl2 *mfccp)
 1069: {
 1070:     struct in_addr 	origin;
 1071:     struct in_addr 	mcastgrp;
 1072:     struct mfc 		*rt;
 1073:     struct mfc	 	**nptr;
 1074:     u_long 		hash;
 1075:     int s;
 1076:     struct bw_meter	*list;
 1077: 
 1078:     origin = mfccp->mfcc_origin;
 1079:     mcastgrp = mfccp->mfcc_mcastgrp;
 1080: 
 1081:     if (mrtdebug & DEBUG_MFC)
 1082: 	log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n",
 1083: 	    (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));
 1084: 
 1085:     s = splnet();
 1086: 
 1087:     hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
 1088:     for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next)
 1089: 	if (origin.s_addr == rt->mfc_origin.s_addr &&
 1090: 		mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
 1091: 		rt->mfc_stall == NULL)
 1092: 	    break;
 1093:     if (rt == NULL) {
 1094: 	splx(s);
 1095: 	return EADDRNOTAVAIL;
 1096:     }
 1097: 
 1098:     *nptr = rt->mfc_next;
 1099: 
 1100:     /*
 1101:      * free the bw_meter entries
 1102:      */
 1103:     list = rt->mfc_bw_meter;
 1104:     rt->mfc_bw_meter = NULL;
 1105: 
 1106:     free(rt, M_MRTABLE);
 1107: 
 1108:     splx(s);
 1109: 
 1110:     free_bw_list(list);
 1111: 
 1112:     return 0;
 1113: }
 1114: 
 1115: /*
 1116:  * Send a message to mrouted on the multicast routing socket
 1117:  */
 1118: static int
 1119: socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
 1120: {
 1121:     if (s) {
 1122: 	if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) {
 1123: 	    sorwakeup(s);
 1124: 	    return 0;
 1125: 	}
 1126:     }
 1127:     m_freem(mm);
 1128:     return -1;
 1129: }
 1130: 
 1131: /*
 1132:  * IP multicast forwarding function. This function assumes that the packet
 1133:  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 1134:  * pointed to by "ifp", and the packet is to be relayed to other networks
 1135:  * that have members of the packet's destination IP multicast group.
 1136:  *
 1137:  * The packet is returned unscathed to the caller, unless it is
 1138:  * erroneous, in which case a non-zero return value tells the caller to
 1139:  * discard it.
 1140:  */
 1141: 
 1142: #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 1143: 
 1144: static int
 1145: X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
 1146:     struct ip_moptions *imo)
 1147: {
 1148:     struct mfc *rt;
 1149:     int s;
 1150:     vifi_t vifi;
 1151: 
 1152:     if (mrtdebug & DEBUG_FORWARD)
 1153: 	log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n",
 1154: 	    (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr),
 1155: 	    (void *)ifp);
 1156: 
 1157:     if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
 1158: 		((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
 1159: 	/*
 1160: 	 * Packet arrived via a physical interface or
 1161: 	 * an encapsulated tunnel or a register_vif.
 1162: 	 */
 1163:     } else {
 1164: 	/*
 1165: 	 * Packet arrived through a source-route tunnel.
 1166: 	 * Source-route tunnels are no longer supported.
 1167: 	 */
 1168: 	static int last_log;
 1169: 	if (last_log != time_second) {
 1170: 	    last_log = time_second;
 1171: 	    log(LOG_ERR,
 1172: 		"ip_mforward: received source-routed packet from %lx\n",
 1173: 		(u_long)ntohl(ip->ip_src.s_addr));
 1174: 	}
 1175: 	return 1;
 1176:     }
 1177: 
 1178:     if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
 1179: 	if (ip->ip_ttl < 255)
 1180: 	    ip->ip_ttl++;	/* compensate for -1 in *_send routines */
 1181: 	if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 1182: 	    struct vif *vifp = viftable + vifi;
 1183: 
 1184: 	    printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s)\n",
 1185: 		(long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr),
 1186: 		vifi,
 1187: 		(vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
 1188: 		vifp->v_ifp->if_xname);
 1189: 	}
 1190: 	return ip_mdq(m, ifp, NULL, vifi);
 1191:     }
 1192:     if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
 1193: 	printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n",
 1194: 	    (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr));
 1195: 	if (!imo)
 1196: 	    printf("In fact, no options were specified at all\n");
 1197:     }
 1198: 
 1199:     /*
 1200:      * Don't forward a packet with time-to-live of zero or one,
 1201:      * or a packet destined to a local-only group.
 1202:      */
 1203:     if (ip->ip_ttl <= 1 || ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP)
 1204: 	return 0;
 1205: 
 1206:     /*
 1207:      * Determine forwarding vifs from the forwarding cache table
 1208:      */
 1209:     s = splnet();
 1210:     ++mrtstat.mrts_mfc_lookups;
 1211:     rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr);
 1212: 
 1213:     /* Entry exists, so forward if necessary */
 1214:     if (rt != NULL) {
 1215: 	splx(s);
 1216: 	return ip_mdq(m, ifp, rt, -1);
 1217:     } else {
 1218: 	/*
 1219: 	 * If we don't have a route for packet's origin,
 1220: 	 * Make a copy of the packet & send message to routing daemon
 1221: 	 */
 1222: 
 1223: 	struct mbuf *mb0;
 1224: 	struct rtdetq *rte;
 1225: 	u_long hash;
 1226: 	int hlen = ip->ip_hl << 2;
 1227: 
 1228: 	++mrtstat.mrts_mfc_misses;
 1229: 
 1230: 	mrtstat.mrts_no_route++;
 1231: 	if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
 1232: 	    log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n",
 1233: 		(u_long)ntohl(ip->ip_src.s_addr),
 1234: 		(u_long)ntohl(ip->ip_dst.s_addr));
 1235: 
 1236: 	/*
 1237: 	 * Allocate mbufs early so that we don't do extra work if we are
 1238: 	 * just going to fail anyway.  Make sure to pullup the header so
 1239: 	 * that other people can't step on it.
 1240: 	 */
 1241: 	rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT);
 1242: 	if (rte == NULL) {
 1243: 	    splx(s);
 1244: 	    return ENOBUFS;
 1245: 	}
 1246: 	mb0 = m_copypacket(m, M_DONTWAIT);
 1247: 	if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
 1248: 	    mb0 = m_pullup(mb0, hlen);
 1249: 	if (mb0 == NULL) {
 1250: 	    free(rte, M_MRTABLE);
 1251: 	    splx(s);
 1252: 	    return ENOBUFS;
 1253: 	}
 1254: 
 1255: 	/* is there an upcall waiting for this flow ? */
 1256: 	hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr);
 1257: 	for (rt = mfctable[hash]; rt; rt = rt->mfc_next) {
 1258: 	    if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) &&
 1259: 		    (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
 1260: 		    (rt->mfc_stall != NULL))
 1261: 		break;
 1262: 	}
 1263: 
 1264: 	if (rt == NULL) {
 1265: 	    int i;
 1266: 	    struct igmpmsg *im;
 1267: 	    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 1268: 	    struct mbuf *mm;
 1269: 
 1270: 	    /*
 1271: 	     * Locate the vifi for the incoming interface for this packet.
 1272: 	     * If none found, drop packet.
 1273: 	     */
 1274: 	    for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
 1275: 		;
 1276: 	    if (vifi >= numvifs)	/* vif not found, drop packet */
 1277: 		goto non_fatal;
 1278: 
 1279: 	    /* no upcall, so make a new entry */
 1280: 	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
 1281: 	    if (rt == NULL)
 1282: 		goto fail;
 1283: 	    /* Make a copy of the header to send to the user level process */
 1284: 	    mm = m_copy(mb0, 0, hlen);
 1285: 	    if (mm == NULL)
 1286: 		goto fail1;
 1287: 
 1288: 	    /*
 1289: 	     * Send message to routing daemon to install
 1290: 	     * a route into the kernel table
 1291: 	     */
 1292: 
 1293: 	    im = mtod(mm, struct igmpmsg *);
 1294: 	    im->im_msgtype = IGMPMSG_NOCACHE;
 1295: 	    im->im_mbz = 0;
 1296: 	    im->im_vif = vifi;
 1297: 
 1298: 	    mrtstat.mrts_upcalls++;
 1299: 
 1300: 	    k_igmpsrc.sin_addr = ip->ip_src;
 1301: 	    if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
 1302: 		log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
 1303: 		++mrtstat.mrts_upq_sockfull;
 1304: fail1:
 1305: 		free(rt, M_MRTABLE);
 1306: fail:
 1307: 		free(rte, M_MRTABLE);
 1308: 		m_freem(mb0);
 1309: 		splx(s);
 1310: 		return ENOBUFS;
 1311: 	    }
 1312: 
 1313: 	    /* insert new entry at head of hash chain */
 1314: 	    rt->mfc_origin.s_addr     = ip->ip_src.s_addr;
 1315: 	    rt->mfc_mcastgrp.s_addr   = ip->ip_dst.s_addr;
 1316: 	    rt->mfc_expire	      = UPCALL_EXPIRE;
 1317: 	    nexpire[hash]++;
 1318: 	    for (i = 0; i < numvifs; i++) {
 1319: 		rt->mfc_ttls[i] = 0;
 1320: 		rt->mfc_flags[i] = 0;
 1321: 	    }
 1322: 	    rt->mfc_parent = -1;
 1323: 
 1324: 	    rt->mfc_rp.s_addr = INADDR_ANY; /* clear the RP address */
 1325: 
 1326: 	    rt->mfc_bw_meter = NULL;
 1327: 
 1328: 	    /* link into table */
 1329: 	    rt->mfc_next   = mfctable[hash];
 1330: 	    mfctable[hash] = rt;
 1331: 	    rt->mfc_stall = rte;
 1332: 
 1333: 	} else {
 1334: 	    /* determine if q has overflowed */
 1335: 	    int npkts = 0;
 1336: 	    struct rtdetq **p;
 1337: 
 1338: 	    /*
 1339: 	     * XXX ouch! we need to append to the list, but we
 1340: 	     * only have a pointer to the front, so we have to
 1341: 	     * scan the entire list every time.
 1342: 	     */
 1343: 	    for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
 1344: 		npkts++;
 1345: 
 1346: 	    if (npkts > MAX_UPQ) {
 1347: 		mrtstat.mrts_upq_ovflw++;
 1348: non_fatal:
 1349: 		free(rte, M_MRTABLE);
 1350: 		m_freem(mb0);
 1351: 		splx(s);
 1352: 		return 0;
 1353: 	    }
 1354: 
 1355: 	    /* Add this entry to the end of the queue */
 1356: 	    *p = rte;
 1357: 	}
 1358: 
 1359: 	rte->m 			= mb0;
 1360: 	rte->ifp 		= ifp;
 1361: 	rte->next		= NULL;
 1362: 
 1363: 	splx(s);
 1364: 
 1365: 	return 0;
 1366:     }
 1367: }
 1368: 
 1369: /*
 1370:  * Clean up the cache entry if upcall is not serviced
 1371:  */
 1372: static void
 1373: expire_upcalls(void *unused)
 1374: {
 1375:     struct rtdetq *rte;
 1376:     struct mfc *mfc, **nptr;
 1377:     int i;
 1378:     int s;
 1379: 
 1380:     s = splnet();
 1381:     for (i = 0; i < MFCTBLSIZ; i++) {
 1382: 	if (nexpire[i] == 0)
 1383: 	    continue;
 1384: 	nptr = &mfctable[i];
 1385: 	for (mfc = *nptr; mfc != NULL; mfc = *nptr) {
 1386: 	    /*
 1387: 	     * Skip real cache entries
 1388: 	     * Make sure it wasn't marked to not expire (shouldn't happen)
 1389: 	     * If it expires now
 1390: 	     */
 1391: 	    if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 &&
 1392: 		    --mfc->mfc_expire == 0) {
 1393: 		if (mrtdebug & DEBUG_EXPIRE)
 1394: 		    log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n",
 1395: 			(u_long)ntohl(mfc->mfc_origin.s_addr),
 1396: 			(u_long)ntohl(mfc->mfc_mcastgrp.s_addr));
 1397: 		/*
 1398: 		 * drop all the packets
 1399: 		 * free the mbuf with the pkt, if, timing info
 1400: 		 */
 1401: 		for (rte = mfc->mfc_stall; rte; ) {
 1402: 		    struct rtdetq *n = rte->next;
 1403: 
 1404: 		    m_freem(rte->m);
 1405: 		    free(rte, M_MRTABLE);
 1406: 		    rte = n;
 1407: 		}
 1408: 		++mrtstat.mrts_cache_cleanups;
 1409: 		nexpire[i]--;
 1410: 
 1411: 		/*
 1412: 		 * free the bw_meter entries
 1413: 		 */
 1414: 		while (mfc->mfc_bw_meter != NULL) {
 1415: 		    struct bw_meter *x = mfc->mfc_bw_meter;
 1416: 
 1417: 		    mfc->mfc_bw_meter = x->bm_mfc_next;
 1418: 		    free(x, M_BWMETER);
 1419: 		}
 1420: 
 1421: 		*nptr = mfc->mfc_next;
 1422: 		free(mfc, M_MRTABLE);
 1423: 	    } else {
 1424: 		nptr = &mfc->mfc_next;
 1425: 	    }
 1426: 	}
 1427:     }
 1428:     splx(s);
 1429:     expire_upcalls_ch = timeout(expire_upcalls, NULL, EXPIRE_TIMEOUT);
 1430: }
 1431: 
 1432: /*
 1433:  * Packet forwarding routine once entry in the cache is made
 1434:  */
 1435: static int
 1436: ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
 1437: {
 1438:     struct ip  *ip = mtod(m, struct ip *);
 1439:     vifi_t vifi;
 1440:     int plen = ip->ip_len;
 1441: 
 1442: /*
 1443:  * Macro to send packet on vif.  Since RSVP packets don't get counted on
 1444:  * input, they shouldn't get counted on output, so statistics keeping is
 1445:  * separate.
 1446:  */
 1447: #define MC_SEND(ip,vifp,m) {				\
 1448: 		if ((vifp)->v_flags & VIFF_TUNNEL)	\
 1449: 		    encap_send((ip), (vifp), (m));	\
 1450: 		else					\
 1451: 		    phyint_send((ip), (vifp), (m));	\
 1452: }
 1453: 
 1454:     /*
 1455:      * If xmt_vif is not -1, send on only the requested vif.
 1456:      *
 1457:      * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.)
 1458:      */
 1459:     if (xmt_vif < numvifs) {
 1460: #ifdef PIM
 1461: 	if (viftable[xmt_vif].v_flags & VIFF_REGISTER)
 1462: 	    pim_register_send(ip, viftable + xmt_vif, m, rt);
 1463:         else
 1464: #endif
 1465: 	MC_SEND(ip, viftable + xmt_vif, m);
 1466: 	return 1;
 1467:     }
 1468: 
 1469:     /*
 1470:      * Don't forward if it didn't arrive from the parent vif for its origin.
 1471:      */
 1472:     vifi = rt->mfc_parent;
 1473:     if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
 1474: 	/* came in the wrong interface */
 1475: 	if (mrtdebug & DEBUG_FORWARD)
 1476: 	    log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
 1477: 		(void *)ifp, vifi, (void *)viftable[vifi].v_ifp);
 1478: 	++mrtstat.mrts_wrong_if;
 1479: 	++rt->mfc_wrong_if;
 1480: 	/*
 1481: 	 * If we are doing PIM assert processing, send a message
 1482: 	 * to the routing daemon.
 1483: 	 *
 1484: 	 * XXX: A PIM-SM router needs the WRONGVIF detection so it
 1485: 	 * can complete the SPT switch, regardless of the type
 1486: 	 * of the iif (broadcast media, GRE tunnel, etc).
 1487: 	 */
 1488: 	if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
 1489: 	    struct timeval now;
 1490: 	    u_long delta;
 1491: 
 1492: #ifdef PIM
 1493: 	    if (ifp == &multicast_register_if)
 1494: 		pimstat.pims_rcv_registers_wrongiif++;
 1495: #endif
 1496: 
 1497: 	    /* Get vifi for the incoming packet */
 1498: 	    for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
 1499: 		;
 1500: 	    if (vifi >= numvifs)
 1501: 		return 0;	/* The iif is not found: ignore the packet. */
 1502: 
 1503: 	    if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
 1504: 		return 0;	/* WRONGVIF disabled: ignore the packet */
 1505: 
 1506: 	    GET_TIME(now);
 1507: 
 1508: 	    TV_DELTA(rt->mfc_last_assert, now, delta);
 1509: 
 1510: 	    if (delta > ASSERT_MSG_TIME) {
 1511: 		struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 1512: 		struct igmpmsg *im;
 1513: 		int hlen = ip->ip_hl << 2;
 1514: 		struct mbuf *mm = m_copy(m, 0, hlen);
 1515: 
 1516: 		if (mm && (M_HASCL(mm) || mm->m_len < hlen))
 1517: 		    mm = m_pullup(mm, hlen);
 1518: 		if (mm == NULL)
 1519: 		    return ENOBUFS;
 1520: 
 1521: 		rt->mfc_last_assert = now;
 1522: 
 1523: 		im = mtod(mm, struct igmpmsg *);
 1524: 		im->im_msgtype	= IGMPMSG_WRONGVIF;
 1525: 		im->im_mbz		= 0;
 1526: 		im->im_vif		= vifi;
 1527: 
 1528: 		mrtstat.mrts_upcalls++;
 1529: 
 1530: 		k_igmpsrc.sin_addr = im->im_src;
 1531: 		if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
 1532: 		    log(LOG_WARNING,
 1533: 			"ip_mforward: ip_mrouter socket queue full\n");
 1534: 		    ++mrtstat.mrts_upq_sockfull;
 1535: 		    return ENOBUFS;
 1536: 		}
 1537: 	    }
 1538: 	}
 1539: 	return 0;
 1540:     }
 1541: 
 1542:     /* If I sourced this packet, it counts as output, else it was input. */
 1543:     if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) {
 1544: 	viftable[vifi].v_pkt_out++;
 1545: 	viftable[vifi].v_bytes_out += plen;
 1546:     } else {
 1547: 	viftable[vifi].v_pkt_in++;
 1548: 	viftable[vifi].v_bytes_in += plen;
 1549:     }
 1550:     rt->mfc_pkt_cnt++;
 1551:     rt->mfc_byte_cnt += plen;
 1552: 
 1553:     /*
 1554:      * For each vif, decide if a copy of the packet should be forwarded.
 1555:      * Forward if:
 1556:      *		- the ttl exceeds the vif's threshold
 1557:      *		- there are group members downstream on interface
 1558:      */
 1559:     for (vifi = 0; vifi < numvifs; vifi++)
 1560: 	if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
 1561: 	    viftable[vifi].v_pkt_out++;
 1562: 	    viftable[vifi].v_bytes_out += plen;
 1563: #ifdef PIM
 1564: 	    if (viftable[vifi].v_flags & VIFF_REGISTER)
 1565: 		pim_register_send(ip, viftable + vifi, m, rt);
 1566: 	    else
 1567: #endif
 1568: 	    MC_SEND(ip, viftable+vifi, m);
 1569: 	}
 1570: 
 1571:     /*
 1572:      * Perform upcall-related bw measuring.
 1573:      */
 1574:     if (rt->mfc_bw_meter != NULL) {
 1575: 	struct bw_meter *x;
 1576: 	struct timeval now;
 1577: 
 1578: 	GET_TIME(now);
 1579: 	for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
 1580: 	    bw_meter_receive_packet(x, plen, &now);
 1581:     }
 1582: 
 1583:     return 0;
 1584: }
 1585: 
 1586: /*
 1587:  * check if a vif number is legal/ok. This is used by ip_output.
 1588:  */
 1589: static int
 1590: X_legal_vif_num(int vif)
 1591: {
 1592:     return (vif >= 0 && vif < numvifs);
 1593: }
 1594: 
 1595: /*
 1596:  * Return the local address used by this vif
 1597:  */
 1598: static u_long
 1599: X_ip_mcast_src(int vifi)
 1600: {
 1601:     if (vifi >= 0 && vifi < numvifs)
 1602: 	return viftable[vifi].v_lcl_addr.s_addr;
 1603:     else
 1604: 	return INADDR_ANY;
 1605: }
 1606: 
 1607: static void
 1608: phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1609: {
 1610:     struct mbuf *mb_copy;
 1611:     int hlen = ip->ip_hl << 2;
 1612: 
 1613:     /*
 1614:      * Make a new reference to the packet; make sure that
 1615:      * the IP header is actually copied, not just referenced,
 1616:      * so that ip_output() only scribbles on the copy.
 1617:      */
 1618:     mb_copy = m_copypacket(m, M_DONTWAIT);
 1619:     if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen))
 1620: 	mb_copy = m_pullup(mb_copy, hlen);
 1621:     if (mb_copy == NULL)
 1622: 	return;
 1623: 
 1624:     if (vifp->v_rate_limit == 0)
 1625: 	tbf_send_packet(vifp, mb_copy);
 1626:     else
 1627: 	tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len);
 1628: }
 1629: 
 1630: static void
 1631: encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 1632: {
 1633:     struct mbuf *mb_copy;
 1634:     struct ip *ip_copy;
 1635:     int i, len = ip->ip_len;
 1636: 
 1637:     /* Take care of delayed checksums */
 1638:     if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 1639: 	in_delayed_cksum(m);
 1640: 	m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 1641:     }
 1642: 
 1643:     /*
 1644:      * copy the old packet & pullup its IP header into the
 1645:      * new mbuf so we can modify it.  Try to fill the new
 1646:      * mbuf since if we don't the ethernet driver will.
 1647:      */
 1648:     MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER);
 1649:     if (mb_copy == NULL)
 1650: 	return;
 1651:     mb_copy->m_data += max_linkhdr;
 1652:     mb_copy->m_len = sizeof(multicast_encap_iphdr);
 1653: 
 1654:     if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
 1655: 	m_freem(mb_copy);
 1656: 	return;
 1657:     }
 1658:     i = MHLEN - M_LEADINGSPACE(mb_copy);
 1659:     if (i > len)
 1660: 	i = len;
 1661:     mb_copy = m_pullup(mb_copy, i);
 1662:     if (mb_copy == NULL)
 1663: 	return;
 1664:     mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr);
 1665: 
 1666:     /*
 1667:      * fill in the encapsulating IP header.
 1668:      */
 1669:     ip_copy = mtod(mb_copy, struct ip *);
 1670:     *ip_copy = multicast_encap_iphdr;
 1671: #ifdef RANDOM_IP_ID
 1672:     ip_copy->ip_id = ip_randomid();
 1673: #else
 1674:     ip_copy->ip_id = htons(ip_id++);
 1675: #endif
 1676:     ip_copy->ip_len += len;
 1677:     ip_copy->ip_src = vifp->v_lcl_addr;
 1678:     ip_copy->ip_dst = vifp->v_rmt_addr;
 1679: 
 1680:     /*
 1681:      * turn the encapsulated IP header back into a valid one.
 1682:      */
 1683:     ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
 1684:     --ip->ip_ttl;
 1685:     ip->ip_len = htons(ip->ip_len);
 1686:     ip->ip_off = htons(ip->ip_off);
 1687:     ip->ip_sum = 0;
 1688:     mb_copy->m_data += sizeof(multicast_encap_iphdr);
 1689:     ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 1690:     mb_copy->m_data -= sizeof(multicast_encap_iphdr);
 1691: 
 1692:     if (vifp->v_rate_limit == 0)
 1693: 	tbf_send_packet(vifp, mb_copy);
 1694:     else
 1695: 	tbf_control(vifp, mb_copy, ip, ip_copy->ip_len);
 1696: }
 1697: 
 1698: /*
 1699:  * De-encapsulate a packet and feed it back through ip input (this
 1700:  * routine is called whenever IP gets a packet with proto type
 1701:  * ENCAP_PROTO and a local destination address).
 1702:  *
 1703:  * This is similar to mroute_encapcheck() + mroute_encap_input() in -current.
 1704:  */
 1705: static void
 1706: X_ipip_input(struct mbuf *m, int off, int proto)
 1707: {
 1708:     struct ip *ip = mtod(m, struct ip *);
 1709:     int hlen = ip->ip_hl << 2;
 1710: 
 1711:     if (!have_encap_tunnel) {
 1712: 	rip_input(m, off, proto);
 1713: 	return;
 1714:     }
 1715:     /*
 1716:      * dump the packet if it's not to a multicast destination or if
 1717:      * we don't have an encapsulating tunnel with the source.
 1718:      * Note:  This code assumes that the remote site IP address
 1719:      * uniquely identifies the tunnel (i.e., that this site has
 1720:      * at most one tunnel with the remote site).
 1721:      */
 1722:     if (!IN_MULTICAST(ntohl(((struct ip *)((char *)ip+hlen))->ip_dst.s_addr))) {
 1723: 	++mrtstat.mrts_bad_tunnel;
 1724: 	m_freem(m);
 1725: 	return;
 1726:     }
 1727:     if (ip->ip_src.s_addr != last_encap_src) {
 1728: 	struct vif *vifp = viftable;
 1729: 	struct vif *vife = vifp + numvifs;
 1730: 
 1731: 	last_encap_src = ip->ip_src.s_addr;
 1732: 	last_encap_vif = NULL;
 1733: 	for ( ; vifp < vife; ++vifp)
 1734: 	    if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) {
 1735: 		if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT))
 1736: 		    == VIFF_TUNNEL)
 1737: 		    last_encap_vif = vifp;
 1738: 		break;
 1739: 	    }
 1740:     }
 1741:     if (last_encap_vif == NULL) {
 1742: 	last_encap_src = INADDR_ANY;
 1743: 	mrtstat.mrts_cant_tunnel++; /*XXX*/
 1744: 	m_freem(m);
 1745: 	if (mrtdebug)
 1746: 	    log(LOG_DEBUG, "ip_mforward: no tunnel with %lx\n",
 1747: 		(u_long)ntohl(ip->ip_src.s_addr));
 1748: 	return;
 1749:     }
 1750: 
 1751:     if (hlen > sizeof(struct ip))
 1752: 	ip_stripoptions(m);
 1753:     m->m_data += sizeof(struct ip);
 1754:     m->m_len -= sizeof(struct ip);
 1755:     m->m_pkthdr.len -= sizeof(struct ip);
 1756:     m->m_pkthdr.rcvif = last_encap_vif->v_ifp;
 1757: 
 1758:     netisr_queue(NETISR_IP, m);
 1759: }
 1760: 
 1761: /*
 1762:  * Token bucket filter module
 1763:  */
 1764: 
 1765: static void
 1766: tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_long p_len)
 1767: {
 1768:     struct tbf *t = vifp->v_tbf;
 1769: 
 1770:     if (p_len > MAX_BKT_SIZE) {		/* drop if packet is too large */
 1771: 	mrtstat.mrts_pkt2large++;
 1772: 	m_freem(m);
 1773: 	return;
 1774:     }
 1775: 
 1776:     tbf_update_tokens(vifp);
 1777: 
 1778:     if (t->tbf_q_len == 0) {		/* queue empty...		*/
 1779: 	if (p_len <= t->tbf_n_tok) {	/* send packet if enough tokens	*/
 1780: 	    t->tbf_n_tok -= p_len;
 1781: 	    tbf_send_packet(vifp, m);
 1782: 	} else {			/* no, queue packet and try later */
 1783: 	    tbf_queue(vifp, m);
 1784: 	    timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS);
 1785: 	}
 1786:     } else if (t->tbf_q_len < t->tbf_max_q_len) {
 1787: 	/* finite queue length, so queue pkts and process queue */
 1788: 	tbf_queue(vifp, m);
 1789: 	tbf_process_q(vifp);
 1790:     } else {
 1791: 	/* queue full, try to dq and queue and process */
 1792: 	if (!tbf_dq_sel(vifp, ip)) {
 1793: 	    mrtstat.mrts_q_overflow++;
 1794: 	    m_freem(m);
 1795: 	} else {
 1796: 	    tbf_queue(vifp, m);
 1797: 	    tbf_process_q(vifp);
 1798: 	}
 1799:     }
 1800: }
 1801: 
 1802: /*
 1803:  * adds a packet to the queue at the interface
 1804:  */
 1805: static void
 1806: tbf_queue(struct vif *vifp, struct mbuf *m)
 1807: {
 1808:     int s = splnet();
 1809:     struct tbf *t = vifp->v_tbf;
 1810: 
 1811:     if (t->tbf_t == NULL)	/* Queue was empty */
 1812: 	t->tbf_q = m;
 1813:     else			/* Insert at tail */
 1814: 	t->tbf_t->m_act = m;
 1815: 
 1816:     t->tbf_t = m;		/* Set new tail pointer */
 1817: 
 1818: #ifdef DIAGNOSTIC
 1819:     /* Make sure we didn't get fed a bogus mbuf */
 1820:     if (m->m_act)
 1821: 	panic("tbf_queue: m_act");
 1822: #endif
 1823:     m->m_act = NULL;
 1824: 
 1825:     t->tbf_q_len++;
 1826: 
 1827:     splx(s);
 1828: }
 1829: 
 1830: /*
 1831:  * processes the queue at the interface
 1832:  */
 1833: static void
 1834: tbf_process_q(struct vif *vifp)
 1835: {
 1836:     int s = splnet();
 1837:     struct tbf *t = vifp->v_tbf;
 1838: 
 1839:     /* loop through the queue at the interface and send as many packets
 1840:      * as possible
 1841:      */
 1842:     while (t->tbf_q_len > 0) {
 1843: 	struct mbuf *m = t->tbf_q;
 1844: 	int len = mtod(m, struct ip *)->ip_len;
 1845: 
 1846: 	/* determine if the packet can be sent */
 1847: 	if (len > t->tbf_n_tok)	/* not enough tokens, we are done */
 1848: 	    break;
 1849: 	/* ok, reduce no of tokens, dequeue and send the packet. */
 1850: 	t->tbf_n_tok -= len;
 1851: 
 1852: 	t->tbf_q = m->m_act;
 1853: 	if (--t->tbf_q_len == 0)
 1854: 	    t->tbf_t = NULL;
 1855: 
 1856: 	m->m_act = NULL;
 1857: 	tbf_send_packet(vifp, m);
 1858:     }
 1859:     splx(s);
 1860: }
 1861: 
 1862: static void
 1863: tbf_reprocess_q(void *xvifp)
 1864: {
 1865:     struct vif *vifp = xvifp;
 1866: 
 1867:     if (ip_mrouter == NULL)
 1868: 	return;
 1869:     tbf_update_tokens(vifp);
 1870:     tbf_process_q(vifp);
 1871:     if (vifp->v_tbf->tbf_q_len)
 1872: 	timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS);
 1873: }
 1874: 
 1875: /* function that will selectively discard a member of the queue
 1876:  * based on the precedence value and the priority
 1877:  */
 1878: static int
 1879: tbf_dq_sel(struct vif *vifp, struct ip *ip)
 1880: {
 1881:     int s = splnet();
 1882:     u_int p;
 1883:     struct mbuf *m, *last;
 1884:     struct mbuf **np;
 1885:     struct tbf *t = vifp->v_tbf;
 1886: 
 1887:     p = priority(vifp, ip);
 1888: 
 1889:     np = &t->tbf_q;
 1890:     last = NULL;
 1891:     while ((m = *np) != NULL) {
 1892: 	if (p > priority(vifp, mtod(m, struct ip *))) {
 1893: 	    *np = m->m_act;
 1894: 	    /* If we're removing the last packet, fix the tail pointer */
 1895: 	    if (m == t->tbf_t)
 1896: 		t->tbf_t = last;
 1897: 	    m_freem(m);
 1898: 	    /* It's impossible for the queue to be empty, but check anyways. */
 1899: 	    if (--t->tbf_q_len == 0)
 1900: 		t->tbf_t = NULL;
 1901: 	    splx(s);
 1902: 	    mrtstat.mrts_drop_sel++;
 1903: 	    return 1;
 1904: 	}
 1905: 	np = &m->m_act;
 1906: 	last = m;
 1907:     }
 1908:     splx(s);
 1909:     return 0;
 1910: }
 1911: 
 1912: static void
 1913: tbf_send_packet(struct vif *vifp, struct mbuf *m)
 1914: {
 1915:     int s = splnet();
 1916: 
 1917:     if (vifp->v_flags & VIFF_TUNNEL)	/* If tunnel options */
 1918: 	ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL);
 1919:     else {
 1920: 	struct ip_moptions imo;
 1921: 	int error;
 1922: 	static struct route ro; /* XXX check this */
 1923: 
 1924: 	imo.imo_multicast_ifp  = vifp->v_ifp;
 1925: 	imo.imo_multicast_ttl  = mtod(m, struct ip *)->ip_ttl - 1;
 1926: 	imo.imo_multicast_loop = 1;
 1927: 	imo.imo_multicast_vif  = -1;
 1928: 
 1929: 	/*
 1930: 	 * Re-entrancy should not be a problem here, because
 1931: 	 * the packets that we send out and are looped back at us
 1932: 	 * should get rejected because they appear to come from
 1933: 	 * the loopback interface, thus preventing looping.
 1934: 	 */
 1935: 	error = ip_output(m, NULL, &ro, IP_FORWARDING, &imo, NULL);
 1936: 
 1937: 	if (mrtdebug & DEBUG_XMIT)
 1938: 	    log(LOG_DEBUG, "phyint_send on vif %d err %d\n",
 1939: 		(int)(vifp - viftable), error);
 1940:     }
 1941:     splx(s);
 1942: }
 1943: 
 1944: /* determine the current time and then
 1945:  * the elapsed time (between the last time and time now)
 1946:  * in milliseconds & update the no. of tokens in the bucket
 1947:  */
 1948: static void
 1949: tbf_update_tokens(struct vif *vifp)
 1950: {
 1951:     struct timeval tp;
 1952:     u_long tm;
 1953:     int s = splnet();
 1954:     struct tbf *t = vifp->v_tbf;
 1955: 
 1956:     GET_TIME(tp);
 1957: 
 1958:     TV_DELTA(tp, t->tbf_last_pkt_t, tm);
 1959: 
 1960:     /*
 1961:      * This formula is actually
 1962:      * "time in seconds" * "bytes/second".
 1963:      *
 1964:      * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
 1965:      *
 1966:      * The (1000/1024) was introduced in add_vif to optimize
 1967:      * this divide into a shift.
 1968:      */
 1969:     t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8;
 1970:     t->tbf_last_pkt_t = tp;
 1971: 
 1972:     if (t->tbf_n_tok > MAX_BKT_SIZE)
 1973: 	t->tbf_n_tok = MAX_BKT_SIZE;
 1974: 
 1975:     splx(s);
 1976: }
 1977: 
 1978: static int
 1979: priority(struct vif *vifp, struct ip *ip)
 1980: {
 1981:     int prio = 50; /* the lowest priority -- default case */
 1982: 
 1983:     /* temporary hack; may add general packet classifier some day */
 1984: 
 1985:     /*
 1986:      * The UDP port space is divided up into four priority ranges:
 1987:      * [0, 16384)     : unclassified - lowest priority
 1988:      * [16384, 32768) : audio - highest priority
 1989:      * [32768, 49152) : whiteboard - medium priority
 1990:      * [49152, 65536) : video - low priority
 1991:      *
 1992:      * Everything else gets lowest priority.
 1993:      */
 1994:     if (ip->ip_p == IPPROTO_UDP) {
 1995: 	struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
 1996: 	switch (ntohs(udp->uh_dport) & 0xc000) {
 1997: 	case 0x4000:
 1998: 	    prio = 70;
 1999: 	    break;
 2000: 	case 0x8000:
 2001: 	    prio = 60;
 2002: 	    break;
 2003: 	case 0xc000:
 2004: 	    prio = 55;
 2005: 	    break;
 2006: 	}
 2007:     }
 2008:     return prio;
 2009: }
 2010: 
 2011: /*
 2012:  * End of token bucket filter modifications
 2013:  */
 2014: 
 2015: static int
 2016: X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt)
 2017: {
 2018:     int error, vifi, s;
 2019: 
 2020:     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
 2021: 	return EOPNOTSUPP;
 2022: 
 2023:     error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
 2024:     if (error)
 2025: 	return error;
 2026: 
 2027:     s = splnet();
 2028: 
 2029:     if (vifi < 0 || vifi >= numvifs) { /* Error if vif is invalid */
 2030: 	splx(s);
 2031: 	return EADDRNOTAVAIL;
 2032:     }
 2033: 
 2034:     if (sopt->sopt_name == IP_RSVP_VIF_ON) {
 2035: 	/* Check if socket is available. */
 2036: 	if (viftable[vifi].v_rsvpd != NULL) {
 2037: 	    splx(s);
 2038: 	    return EADDRINUSE;
 2039: 	}
 2040: 
 2041: 	viftable[vifi].v_rsvpd = so;
 2042: 	/* This may seem silly, but we need to be sure we don't over-increment
 2043: 	 * the RSVP counter, in case something slips up.
 2044: 	 */
 2045: 	if (!viftable[vifi].v_rsvp_on) {
 2046: 	    viftable[vifi].v_rsvp_on = 1;
 2047: 	    rsvp_on++;
 2048: 	}
 2049:     } else { /* must be VIF_OFF */
 2050: 	/*
 2051: 	 * XXX as an additional consistency check, one could make sure
 2052: 	 * that viftable[vifi].v_rsvpd == so, otherwise passing so as
 2053: 	 * first parameter is pretty useless.
 2054: 	 */
 2055: 	viftable[vifi].v_rsvpd = NULL;
 2056: 	/*
 2057: 	 * This may seem silly, but we need to be sure we don't over-decrement
 2058: 	 * the RSVP counter, in case something slips up.
 2059: 	 */
 2060: 	if (viftable[vifi].v_rsvp_on) {
 2061: 	    viftable[vifi].v_rsvp_on = 0;
 2062: 	    rsvp_on--;
 2063: 	}
 2064:     }
 2065:     splx(s);
 2066:     return 0;
 2067: }
 2068: 
 2069: static void
 2070: X_ip_rsvp_force_done(struct socket *so)
 2071: {
 2072:     int vifi;
 2073:     int s;
 2074: 
 2075:     /* Don't bother if it is not the right type of socket. */
 2076:     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
 2077: 	return;
 2078: 
 2079:     s = splnet();
 2080: 
 2081:     /* The socket may be attached to more than one vif...this
 2082:      * is perfectly legal.
 2083:      */
 2084:     for (vifi = 0; vifi < numvifs; vifi++) {
 2085: 	if (viftable[vifi].v_rsvpd == so) {
 2086: 	    viftable[vifi].v_rsvpd = NULL;
 2087: 	    /* This may seem silly, but we need to be sure we don't
 2088: 	     * over-decrement the RSVP counter, in case something slips up.
 2089: 	     */
 2090: 	    if (viftable[vifi].v_rsvp_on) {
 2091: 		viftable[vifi].v_rsvp_on = 0;
 2092: 		rsvp_on--;
 2093: 	    }
 2094: 	}
 2095:     }
 2096: 
 2097:     splx(s);
 2098: }
 2099: 
 2100: static void
 2101: X_rsvp_input(struct mbuf *m, int off, int proto)
 2102: {
 2103:     int vifi;
 2104:     struct ip *ip = mtod(m, struct ip *);
 2105:     struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET };
 2106:     int s;
 2107:     struct ifnet *ifp;
 2108: 
 2109:     if (rsvpdebug)
 2110: 	printf("rsvp_input: rsvp_on %d\n",rsvp_on);
 2111: 
 2112:     /* Can still get packets with rsvp_on = 0 if there is a local member
 2113:      * of the group to which the RSVP packet is addressed.  But in this
 2114:      * case we want to throw the packet away.
 2115:      */
 2116:     if (!rsvp_on) {
 2117: 	m_freem(m);
 2118: 	return;
 2119:     }
 2120: 
 2121:     s = splnet();
 2122: 
 2123:     if (rsvpdebug)
 2124: 	printf("rsvp_input: check vifs\n");
 2125: 
 2126: #ifdef DIAGNOSTIC
 2127:     if (!(m->m_flags & M_PKTHDR))
 2128: 	panic("rsvp_input no hdr");
 2129: #endif
 2130: 
 2131:     ifp = m->m_pkthdr.rcvif;
 2132:     /* Find which vif the packet arrived on. */
 2133:     for (vifi = 0; vifi < numvifs; vifi++)
 2134: 	if (viftable[vifi].v_ifp == ifp)
 2135: 	    break;
 2136: 
 2137:     if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) {
 2138: 	/*
 2139: 	 * If the old-style non-vif-associated socket is set,
 2140: 	 * then use it.  Otherwise, drop packet since there
 2141: 	 * is no specific socket for this vif.
 2142: 	 */
 2143: 	if (ip_rsvpd != NULL) {
 2144: 	    if (rsvpdebug)
 2145: 		printf("rsvp_input: Sending packet up old-style socket\n");
 2146: 	    rip_input(m, off, proto);  /* xxx */
 2147: 	} else {
 2148: 	    if (rsvpdebug && vifi == numvifs)
 2149: 		printf("rsvp_input: Can't find vif for packet.\n");
 2150: 	    else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL)
 2151: 		printf("rsvp_input: No socket defined for vif %d\n",vifi);
 2152: 	    m_freem(m);
 2153: 	}
 2154: 	splx(s);
 2155: 	return;
 2156:     }
 2157:     rsvp_src.sin_addr = ip->ip_src;
 2158: 
 2159:     if (rsvpdebug && m)
 2160: 	printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n",
 2161: 	       m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv)));
 2162: 
 2163:     if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) {
 2164: 	if (rsvpdebug)
 2165: 	    printf("rsvp_input: Failed to append to socket\n");
 2166:     } else {
 2167: 	if (rsvpdebug)
 2168: 	    printf("rsvp_input: send packet up\n");
 2169:     }
 2170: 
 2171:     splx(s);
 2172: }
 2173: 
 2174: /*
 2175:  * Code for bandwidth monitors
 2176:  */
 2177: 
 2178: /*
 2179:  * Define common interface for timeval-related methods
 2180:  */
 2181: #define	BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
 2182: #define	BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
 2183: #define	BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp))
 2184: 
 2185: static uint32_t
 2186: compute_bw_meter_flags(struct bw_upcall *req)
 2187: {
 2188:     uint32_t flags = 0;
 2189: 
 2190:     if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
 2191: 	flags |= BW_METER_UNIT_PACKETS;
 2192:     if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
 2193: 	flags |= BW_METER_UNIT_BYTES;
 2194:     if (req->bu_flags & BW_UPCALL_GEQ)
 2195: 	flags |= BW_METER_GEQ;
 2196:     if (req->bu_flags & BW_UPCALL_LEQ)
 2197: 	flags |= BW_METER_LEQ;
 2198:     
 2199:     return flags;
 2200: }
 2201:  
 2202: /*
 2203:  * Add a bw_meter entry
 2204:  */
 2205: static int
 2206: add_bw_upcall(struct bw_upcall *req)
 2207: {
 2208:     struct mfc *mfc;
 2209:     struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
 2210: 		BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
 2211:     struct timeval now;
 2212:     struct bw_meter *x;
 2213:     uint32_t flags;
 2214:     int s;
 2215:     
 2216:     if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2217: 	return EOPNOTSUPP;
 2218:     
 2219:     /* Test if the flags are valid */
 2220:     if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
 2221: 	return EINVAL;
 2222:     if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
 2223: 	return EINVAL;
 2224:     if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2225: 	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 2226: 	return EINVAL;
 2227:     
 2228:     /* Test if the threshold time interval is valid */
 2229:     if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
 2230: 	return EINVAL;
 2231:     
 2232:     flags = compute_bw_meter_flags(req);
 2233: 
 2234:     /*
 2235:      * Find if we have already same bw_meter entry
 2236:      */
 2237:     s = splnet();
 2238:     mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr);
 2239:     if (mfc == NULL) {
 2240: 	splx(s);
 2241: 	return EADDRNOTAVAIL;
 2242:     }
 2243:     for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
 2244: 	if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2245: 			   &req->bu_threshold.b_time, ==)) &&
 2246: 	    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 2247: 	    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2248: 	    (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
 2249: 	    splx(s);
 2250: 	    return 0;		/* XXX Already installed */
 2251: 	}
 2252:     }
 2253:     splx(s);
 2254:     
 2255:     /* Allocate the new bw_meter entry */
 2256:     x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
 2257:     if (x == NULL)
 2258: 	return ENOBUFS;
 2259:     
 2260:     /* Set the new bw_meter entry */
 2261:     x->bm_threshold.b_time = req->bu_threshold.b_time;
 2262:     GET_TIME(now);
 2263:     x->bm_start_time = now;
 2264:     x->bm_threshold.b_packets = req->bu_threshold.b_packets;
 2265:     x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
 2266:     x->bm_measured.b_packets = 0;
 2267:     x->bm_measured.b_bytes = 0;
 2268:     x->bm_flags = flags;
 2269:     x->bm_time_next = NULL;
 2270:     x->bm_time_hash = BW_METER_BUCKETS;
 2271:     
 2272:     /* Add the new bw_meter entry to the front of entries for this MFC */
 2273:     s = splnet();
 2274:     x->bm_mfc = mfc;
 2275:     x->bm_mfc_next = mfc->mfc_bw_meter;
 2276:     mfc->mfc_bw_meter = x;
 2277:     schedule_bw_meter(x, &now);
 2278:     splx(s);
 2279:     
 2280:     return 0;
 2281: }
 2282: 
 2283: static void
 2284: free_bw_list(struct bw_meter *list)
 2285: {
 2286:     while (list != NULL) {
 2287: 	struct bw_meter *x = list;
 2288: 
 2289: 	list = list->bm_mfc_next;
 2290: 	unschedule_bw_meter(x);
 2291: 	free(x, M_BWMETER);
 2292:     }
 2293: }
 2294: 
 2295: /*
 2296:  * Delete one or multiple bw_meter entries
 2297:  */
 2298: static int
 2299: del_bw_upcall(struct bw_upcall *req)
 2300: {
 2301:     struct mfc *mfc;
 2302:     struct bw_meter *x;
 2303:     int s;
 2304:     
 2305:     if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
 2306: 	return EOPNOTSUPP;
 2307:     
 2308:     s = splnet();
 2309:     /* Find the corresponding MFC entry */
 2310:     mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr);
 2311:     if (mfc == NULL) {
 2312: 	splx(s);
 2313: 	return EADDRNOTAVAIL;
 2314:     } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
 2315: 	/*
 2316: 	 * Delete all bw_meter entries for this mfc
 2317: 	 */
 2318: 	struct bw_meter *list;
 2319: 	
 2320: 	list = mfc->mfc_bw_meter;
 2321: 	mfc->mfc_bw_meter = NULL;
 2322: 	splx(s);
 2323: 	free_bw_list(list);
 2324: 	return 0;
 2325:     } else {			/* Delete a single bw_meter entry */
 2326: 	struct bw_meter *prev;
 2327: 	uint32_t flags = 0;
 2328: 
 2329: 	flags = compute_bw_meter_flags(req);
 2330: 
 2331: 	/* Find the bw_meter entry to delete */
 2332: 	for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
 2333: 	     x = x->bm_mfc_next) {
 2334: 	    if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 2335: 			       &req->bu_threshold.b_time, ==)) &&
 2336: 		(x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 2337: 		(x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 2338: 		(x->bm_flags & BW_METER_USER_FLAGS) == flags)
 2339: 		break;
 2340: 	}
 2341: 	if (x != NULL) { /* Delete entry from the list for this MFC */
 2342: 	    if (prev != NULL)
 2343: 		prev->bm_mfc_next = x->bm_mfc_next;	/* remove from middle*/
 2344: 	    else
 2345: 		x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
 2346: 	    splx(s);
 2347: 
 2348: 	    unschedule_bw_meter(x);
 2349: 	    /* Free the bw_meter entry */
 2350: 	    free(x, M_BWMETER);
 2351: 	    return 0;
 2352: 	} else {
 2353: 	    splx(s);
 2354: 	    return EINVAL;
 2355: 	}
 2356:     }
 2357:     /* NOTREACHED */
 2358: }
 2359: 
 2360: /*
 2361:  * Perform bandwidth measurement processing that may result in an upcall
 2362:  */
 2363: static void
 2364: bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 2365: {
 2366:     struct timeval delta;
 2367:     int s;
 2368:     
 2369:     s = splnet();
 2370:     delta = *nowp;
 2371:     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2372:     
 2373:     if (x->bm_flags & BW_METER_GEQ) {
 2374: 	/*
 2375: 	 * Processing for ">=" type of bw_meter entry
 2376: 	 */
 2377: 	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2378: 	    /* Reset the bw_meter entry */
 2379: 	    x->bm_start_time = *nowp;
 2380: 	    x->bm_measured.b_packets = 0;
 2381: 	    x->bm_measured.b_bytes = 0;
 2382: 	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2383: 	}
 2384: 	
 2385: 	/* Record that a packet is received */
 2386: 	x->bm_measured.b_packets++;
 2387: 	x->bm_measured.b_bytes += plen;
 2388: 	
 2389: 	/*
 2390: 	 * Test if we should deliver an upcall
 2391: 	 */
 2392: 	if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {	
 2393: 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2394: 		 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
 2395: 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2396: 		 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
 2397: 		/* Prepare an upcall for delivery */
 2398: 		bw_meter_prepare_upcall(x, nowp);
 2399: 		x->bm_flags |= BW_METER_UPCALL_DELIVERED;
 2400: 	    }
 2401: 	}
 2402:     } else if (x->bm_flags & BW_METER_LEQ) {
 2403: 	/*
 2404: 	 * Processing for "<=" type of bw_meter entry
 2405: 	 */
 2406: 	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 2407: 	    /*
 2408: 	     * We are behind time with the multicast forwarding table
 2409: 	     * scanning for "<=" type of bw_meter entries, so test now
 2410: 	     * if we should deliver an upcall.
 2411: 	     */
 2412: 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2413: 		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 2414: 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2415: 		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 2416: 		/* Prepare an upcall for delivery */
 2417: 		bw_meter_prepare_upcall(x, nowp);
 2418: 	    }
 2419: 	    /* Reschedule the bw_meter entry */
 2420: 	    unschedule_bw_meter(x);
 2421: 	    schedule_bw_meter(x, nowp);
 2422: 	}
 2423: 	
 2424: 	/* Record that a packet is received */
 2425: 	x->bm_measured.b_packets++;
 2426: 	x->bm_measured.b_bytes += plen;
 2427: 	
 2428: 	/*
 2429: 	 * Test if we should restart the measuring interval
 2430: 	 */
 2431: 	if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
 2432: 	     x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
 2433: 	    (x->bm_flags & BW_METER_UNIT_BYTES &&
 2434: 	     x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
 2435: 	    /* Don't restart the measuring interval */
 2436: 	} else {
 2437: 	    /* Do restart the measuring interval */
 2438: 	    /*
 2439: 	     * XXX: note that we don't unschedule and schedule, because this
 2440: 	     * might be too much overhead per packet. Instead, when we process
 2441: 	     * all entries for a given timer hash bin, we check whether it is
 2442: 	     * really a timeout. If not, we reschedule at that time.
 2443: 	     */
 2444: 	    x->bm_start_time = *nowp;
 2445: 	    x->bm_measured.b_packets = 0;
 2446: 	    x->bm_measured.b_bytes = 0;
 2447: 	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2448: 	}
 2449:     }
 2450:     splx(s);
 2451: }
 2452: 
 2453: /*
 2454:  * Prepare a bandwidth-related upcall
 2455:  */
 2456: static void
 2457: bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 2458: {
 2459:     struct timeval delta;
 2460:     struct bw_upcall *u;
 2461:     int s;
 2462:     
 2463:     s = splnet();
 2464:     
 2465:     /*
 2466:      * Compute the measured time interval 
 2467:      */
 2468:     delta = *nowp;
 2469:     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 2470:     
 2471:     /*
 2472:      * If there are too many pending upcalls, deliver them now
 2473:      */
 2474:     if (bw_upcalls_n >= BW_UPCALLS_MAX)
 2475: 	bw_upcalls_send();
 2476:     
 2477:     /*
 2478:      * Set the bw_upcall entry
 2479:      */
 2480:     u = &bw_upcalls[bw_upcalls_n++];
 2481:     u->bu_src = x->bm_mfc->mfc_origin;
 2482:     u->bu_dst = x->bm_mfc->mfc_mcastgrp;
 2483:     u->bu_threshold.b_time = x->bm_threshold.b_time;
 2484:     u->bu_threshold.b_packets = x->bm_threshold.b_packets;
 2485:     u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
 2486:     u->bu_measured.b_time = delta;
 2487:     u->bu_measured.b_packets = x->bm_measured.b_packets;
 2488:     u->bu_measured.b_bytes = x->bm_measured.b_bytes;
 2489:     u->bu_flags = 0;
 2490:     if (x->bm_flags & BW_METER_UNIT_PACKETS)
 2491: 	u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
 2492:     if (x->bm_flags & BW_METER_UNIT_BYTES)
 2493: 	u->bu_flags |= BW_UPCALL_UNIT_BYTES;
 2494:     if (x->bm_flags & BW_METER_GEQ)
 2495: 	u->bu_flags |= BW_UPCALL_GEQ;
 2496:     if (x->bm_flags & BW_METER_LEQ)
 2497: 	u->bu_flags |= BW_UPCALL_LEQ;
 2498:     
 2499:     splx(s);
 2500: }
 2501: 
 2502: /*
 2503:  * Send the pending bandwidth-related upcalls
 2504:  */
 2505: static void
 2506: bw_upcalls_send(void)
 2507: {
 2508:     struct mbuf *m;
 2509:     int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
 2510:     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 2511:     static struct igmpmsg igmpmsg = { 0,		/* unused1 */
 2512: 				      0,		/* unused2 */
 2513: 				      IGMPMSG_BW_UPCALL,/* im_msgtype */
 2514: 				      0,		/* im_mbz  */
 2515: 				      0,		/* im_vif  */
 2516: 				      0,		/* unused3 */
 2517: 				      { 0 },		/* im_src  */
 2518: 				      { 0 } };		/* im_dst  */
 2519:     
 2520:     if (bw_upcalls_n == 0)
 2521: 	return;			/* No pending upcalls */
 2522: 
 2523:     bw_upcalls_n = 0;
 2524:     
 2525:     /*
 2526:      * Allocate a new mbuf, initialize it with the header and
 2527:      * the payload for the pending calls.
 2528:      */
 2529:     MGETHDR(m, M_DONTWAIT, MT_HEADER);
 2530:     if (m == NULL) {
 2531: 	log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
 2532: 	return;
 2533:     }
 2534:     
 2535:     m->m_len = m->m_pkthdr.len = 0;
 2536:     m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
 2537:     m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]);
 2538:     
 2539:     /*
 2540:      * Send the upcalls
 2541:      * XXX do we need to set the address in k_igmpsrc ?
 2542:      */
 2543:     mrtstat.mrts_upcalls++;
 2544:     if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
 2545: 	log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
 2546: 	++mrtstat.mrts_upq_sockfull;
 2547:     }
 2548: }
 2549: 
 2550: /*
 2551:  * Compute the timeout hash value for the bw_meter entries
 2552:  */
 2553: #define	BW_METER_TIMEHASH(bw_meter, hash)				\
 2554:     do {								\
 2555: 	struct timeval next_timeval = (bw_meter)->bm_start_time;	\
 2556: 									\
 2557: 	BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
 2558: 	(hash) = next_timeval.tv_sec;					\
 2559: 	if (next_timeval.tv_usec)					\
 2560: 	    (hash)++; /* XXX: make sure we don't timeout early */	\
 2561: 	(hash) %= BW_METER_BUCKETS;					\
 2562:     } while (0)
 2563: 
 2564: /*
 2565:  * Schedule a timer to process periodically bw_meter entry of type "<="
 2566:  * by linking the entry in the proper hash bucket.
 2567:  */
 2568: static void
 2569: schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
 2570: {
 2571:     int time_hash, s;
 2572:     
 2573:     if (!(x->bm_flags & BW_METER_LEQ))
 2574: 	return;		/* XXX: we schedule timers only for "<=" entries */
 2575:     
 2576:     /*
 2577:      * Reset the bw_meter entry
 2578:      */
 2579:     s = splnet();
 2580:     x->bm_start_time = *nowp;
 2581:     x->bm_measured.b_packets = 0;
 2582:     x->bm_measured.b_bytes = 0;
 2583:     x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 2584:     splx(s);
 2585:     
 2586:     /*
 2587:      * Compute the timeout hash value and insert the entry
 2588:      */
 2589:     BW_METER_TIMEHASH(x, time_hash);
 2590:     x->bm_time_next = bw_meter_timers[time_hash];
 2591:     bw_meter_timers[time_hash] = x;
 2592:     x->bm_time_hash = time_hash;
 2593: }
 2594: 
 2595: /*
 2596:  * Unschedule the periodic timer that processes bw_meter entry of type "<="
 2597:  * by removing the entry from the proper hash bucket.
 2598:  */
 2599: static void
 2600: unschedule_bw_meter(struct bw_meter *x)
 2601: {
 2602:     int time_hash;
 2603:     struct bw_meter *prev, *tmp;
 2604:     
 2605:     if (!(x->bm_flags & BW_METER_LEQ))
 2606: 	return;		/* XXX: we schedule timers only for "<=" entries */
 2607:     
 2608:     /*
 2609:      * Compute the timeout hash value and delete the entry
 2610:      */
 2611:     time_hash = x->bm_time_hash;
 2612:     if (time_hash >= BW_METER_BUCKETS)
 2613: 	return;		/* Entry was not scheduled */
 2614:     
 2615:     for (prev = NULL, tmp = bw_meter_timers[time_hash];
 2616: 	     tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
 2617: 	if (tmp == x)
 2618: 	    break;
 2619:     
 2620:     if (tmp == NULL)
 2621: 	panic("unschedule_bw_meter: bw_meter entry not found");
 2622:     
 2623:     if (prev != NULL)
 2624: 	prev->bm_time_next = x->bm_time_next;
 2625:     else
 2626: 	bw_meter_timers[time_hash] = x->bm_time_next;
 2627:     
 2628:     x->bm_time_next = NULL;
 2629:     x->bm_time_hash = BW_METER_BUCKETS;
 2630: }
 2631: 
 2632: 
 2633: /*
 2634:  * Process all "<=" type of bw_meter that should be processed now,
 2635:  * and for each entry prepare an upcall if necessary. Each processed
 2636:  * entry is rescheduled again for the (periodic) processing.
 2637:  *
 2638:  * This is run periodically (once per second normally). On each round,
 2639:  * all the potentially matching entries are in the hash slot that we are
 2640:  * looking at.
 2641:  */
 2642: static void
 2643: bw_meter_process()
 2644: {
 2645:     static uint32_t last_tv_sec;	/* last time we processed this */
 2646: 
 2647:     uint32_t loops;
 2648:     int i, s;
 2649:     struct timeval now, process_endtime;
 2650:     
 2651:     GET_TIME(now);
 2652:     if (last_tv_sec == now.tv_sec)
 2653: 	return;		/* nothing to do */
 2654: 
 2655:     s = splnet();
 2656:     loops = now.tv_sec - last_tv_sec;
 2657:     last_tv_sec = now.tv_sec;
 2658:     if (loops > BW_METER_BUCKETS)
 2659: 	loops = BW_METER_BUCKETS;
 2660: 
 2661:     /*
 2662:      * Process all bins of bw_meter entries from the one after the last
 2663:      * processed to the current one. On entry, i points to the last bucket
 2664:      * visited, so we need to increment i at the beginning of the loop.
 2665:      */
 2666:     for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
 2667: 	struct bw_meter *x, *tmp_list;
 2668: 	
 2669: 	if (++i >= BW_METER_BUCKETS)
 2670: 	    i = 0;
 2671: 	
 2672: 	/* Disconnect the list of bw_meter entries from the bin */
 2673: 	tmp_list = bw_meter_timers[i];
 2674: 	bw_meter_timers[i] = NULL;
 2675: 	
 2676: 	/* Process the list of bw_meter entries */
 2677: 	while (tmp_list != NULL) {
 2678: 	    x = tmp_list;
 2679: 	    tmp_list = tmp_list->bm_time_next;
 2680: 	    
 2681: 	    /* Test if the time interval is over */
 2682: 	    process_endtime = x->bm_start_time;
 2683: 	    BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
 2684: 	    if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
 2685: 		/* Not yet: reschedule, but don't reset */
 2686: 		int time_hash;
 2687: 		
 2688: 		BW_METER_TIMEHASH(x, time_hash);
 2689: 		if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
 2690: 		    /*
 2691: 		     * XXX: somehow the bin processing is a bit ahead of time.
 2692: 		     * Put the entry in the next bin.
 2693: 		     */
 2694: 		    if (++time_hash >= BW_METER_BUCKETS)
 2695: 			time_hash = 0;
 2696: 		}
 2697: 		x->bm_time_next = bw_meter_timers[time_hash];
 2698: 		bw_meter_timers[time_hash] = x;
 2699: 		x->bm_time_hash = time_hash;
 2700: 		
 2701: 		continue;
 2702: 	    }
 2703: 	    
 2704: 	    /*
 2705: 	     * Test if we should deliver an upcall
 2706: 	     */
 2707: 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 2708: 		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 2709: 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 2710: 		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 2711: 		/* Prepare an upcall for delivery */
 2712: 		bw_meter_prepare_upcall(x, &now);
 2713: 	    }
 2714: 	    
 2715: 	    /*
 2716: 	     * Reschedule for next processing
 2717: 	     */
 2718: 	    schedule_bw_meter(x, &now);
 2719: 	}
 2720:     }
 2721:     splx(s);
 2722:     
 2723:     /* Send all upcalls that are pending delivery */
 2724:     bw_upcalls_send();
 2725: }
 2726: 
 2727: /*
 2728:  * A periodic function for sending all upcalls that are pending delivery
 2729:  */
 2730: static void
 2731: expire_bw_upcalls_send(void *unused)
 2732: {
 2733:     bw_upcalls_send();
 2734:     
 2735:     bw_upcalls_ch = timeout(expire_bw_upcalls_send, NULL, BW_UPCALLS_PERIOD);
 2736: }
 2737: 
 2738: /*
 2739:  * A periodic function for periodic scanning of the multicast forwarding
 2740:  * table for processing all "<=" bw_meter entries.
 2741:  */
 2742: static void
 2743: expire_bw_meter_process(void *unused)
 2744: {
 2745:     if (mrt_api_config & MRT_MFC_BW_UPCALL)
 2746: 	bw_meter_process();
 2747:     
 2748:     bw_meter_ch = timeout(expire_bw_meter_process, NULL, BW_METER_PERIOD);
 2749: }
 2750: 
 2751: /*
 2752:  * End of bandwidth monitoring code
 2753:  */
 2754: 
 2755: #ifdef PIM
 2756: /*
 2757:  * Send the packet up to the user daemon, or eventually do kernel encapsulation
 2758:  *
 2759:  */
 2760: static int
 2761: pim_register_send(struct ip *ip, struct vif *vifp,
 2762: 	struct mbuf *m, struct mfc *rt)
 2763: {
 2764:     struct mbuf *mb_copy, *mm;
 2765:     
 2766:     if (mrtdebug & DEBUG_PIM)
 2767:         log(LOG_DEBUG, "pim_register_send: ");
 2768:     
 2769:     mb_copy = pim_register_prepare(ip, m);
 2770:     if (mb_copy == NULL)
 2771: 	return ENOBUFS;
 2772:     
 2773:     /*
 2774:      * Send all the fragments. Note that the mbuf for each fragment
 2775:      * is freed by the sending machinery.
 2776:      */
 2777:     for (mm = mb_copy; mm; mm = mb_copy) {
 2778: 	mb_copy = mm->m_nextpkt;
 2779: 	mm->m_nextpkt = 0;
 2780: 	mm = m_pullup(mm, sizeof(struct ip));
 2781: 	if (mm != NULL) {
 2782: 	    ip = mtod(mm, struct ip *);
 2783: 	    if ((mrt_api_config & MRT_MFC_RP) &&
 2784: 		(rt->mfc_rp.s_addr != INADDR_ANY)) {
 2785: 		pim_register_send_rp(ip, vifp, mm, rt);
 2786: 	    } else {
 2787: 		pim_register_send_upcall(ip, vifp, mm, rt);
 2788: 	    }
 2789: 	}
 2790:     }
 2791:     
 2792:     return 0;
 2793: }
 2794: 
 2795: /*
 2796:  * Return a copy of the data packet that is ready for PIM Register
 2797:  * encapsulation.
 2798:  * XXX: Note that in the returned copy the IP header is a valid one.
 2799:  */
 2800: static struct mbuf *
 2801: pim_register_prepare(struct ip *ip, struct mbuf *m)
 2802: {
 2803:     struct mbuf *mb_copy = NULL;
 2804:     int mtu;
 2805:     
 2806:     /* Take care of delayed checksums */
 2807:     if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 2808: 	in_delayed_cksum(m);
 2809: 	m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 2810:     }
 2811: 
 2812:     /*
 2813:      * Copy the old packet & pullup its IP header into the
 2814:      * new mbuf so we can modify it.
 2815:      */
 2816:     mb_copy = m_copypacket(m, M_DONTWAIT);
 2817:     if (mb_copy == NULL)
 2818: 	return NULL;
 2819:     mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
 2820:     if (mb_copy == NULL)
 2821: 	return NULL;
 2822:     
 2823:     /* take care of the TTL */
 2824:     ip = mtod(mb_copy, struct ip *);
 2825:     --ip->ip_ttl;
 2826:     
 2827:     /* Compute the MTU after the PIM Register encapsulation */
 2828:     mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
 2829:     
 2830:     if (ip->ip_len <= mtu) {
 2831: 	/* Turn the IP header into a valid one */
 2832: 	ip->ip_len = htons(ip->ip_len);
 2833: 	ip->ip_off = htons(ip->ip_off);
 2834: 	ip->ip_sum = 0;
 2835: 	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
 2836:     } else {
 2837: 	/* Fragment the packet */
 2838: 	if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) {
 2839: 	    m_freem(mb_copy);
 2840: 	    return NULL;
 2841: 	}
 2842:     }
 2843:     return mb_copy;
 2844: }
 2845: 
 2846: /*
 2847:  * Send an upcall with the data packet to the user-level process.
 2848:  */
 2849: static int
 2850: pim_register_send_upcall(struct ip *ip, struct vif *vifp,
 2851: 	struct mbuf *mb_copy, struct mfc *rt)
 2852: {
 2853:     struct mbuf *mb_first;
 2854:     int len = ntohs(ip->ip_len);
 2855:     struct igmpmsg *im;
 2856:     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 2857:     
 2858:     /*
 2859:      * Add a new mbuf with an upcall header
 2860:      */
 2861:     MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 2862:     if (mb_first == NULL) {
 2863: 	m_freem(mb_copy);
 2864: 	return ENOBUFS;
 2865:     }
 2866:     mb_first->m_data += max_linkhdr;
 2867:     mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
 2868:     mb_first->m_len = sizeof(struct igmpmsg);
 2869:     mb_first->m_next = mb_copy;
 2870:     
 2871:     /* Send message to routing daemon */
 2872:     im = mtod(mb_first, struct igmpmsg *);
 2873:     im->im_msgtype	= IGMPMSG_WHOLEPKT;
 2874:     im->im_mbz		= 0;
 2875:     im->im_vif		= vifp - viftable;
 2876:     im->im_src		= ip->ip_src;
 2877:     im->im_dst		= ip->ip_dst;
 2878:     
 2879:     k_igmpsrc.sin_addr	= ip->ip_src;
 2880:     
 2881:     mrtstat.mrts_upcalls++;
 2882:     
 2883:     if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
 2884: 	if (mrtdebug & DEBUG_PIM)
 2885: 	    log(LOG_WARNING,
 2886: 		"mcast: pim_register_send_upcall: ip_mrouter socket queue full");
 2887: 	++mrtstat.mrts_upq_sockfull;
 2888: 	return ENOBUFS;
 2889:     }
 2890:     
 2891:     /* Keep statistics */
 2892:     pimstat.pims_snd_registers_msgs++;
 2893:     pimstat.pims_snd_registers_bytes += len;
 2894:     
 2895:     return 0;
 2896: }
 2897: 
 2898: /*
 2899:  * Encapsulate the data packet in PIM Register message and send it to the RP.
 2900:  */
 2901: static int
 2902: pim_register_send_rp(struct ip *ip, struct vif *vifp,
 2903: 	struct mbuf *mb_copy, struct mfc *rt)
 2904: {
 2905:     struct mbuf *mb_first;
 2906:     struct ip *ip_outer;
 2907:     struct pim_encap_pimhdr *pimhdr;
 2908:     int len = ntohs(ip->ip_len);
 2909:     vifi_t vifi = rt->mfc_parent;
 2910:     
 2911:     if ((vifi >= numvifs) || (viftable[vifi].v_lcl_addr.s_addr == 0)) {
 2912: 	m_freem(mb_copy);
 2913: 	return EADDRNOTAVAIL;		/* The iif vif is invalid */
 2914:     }
 2915:     
 2916:     /*
 2917:      * Add a new mbuf with the encapsulating header
 2918:      */
 2919:     MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
 2920:     if (mb_first == NULL) {
 2921: 	m_freem(mb_copy);
 2922: 	return ENOBUFS;
 2923:     }
 2924:     mb_first->m_data += max_linkhdr;
 2925:     mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
 2926:     mb_first->m_next = mb_copy;
 2927: 
 2928:     mb_first->m_pkthdr.len = len + mb_first->m_len;
 2929:     
 2930:     /*
 2931:      * Fill in the encapsulating IP and PIM header
 2932:      */
 2933:     ip_outer = mtod(mb_first, struct ip *);
 2934:     *ip_outer = pim_encap_iphdr;
 2935: #ifdef RANDOM_IP_ID
 2936:     ip_outer->ip_id = ip_randomid();
 2937: #else
 2938:     ip_outer->ip_id = htons(ip_id++);
 2939: #endif
 2940:     ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
 2941:     ip_outer->ip_src = viftable[vifi].v_lcl_addr;
 2942:     ip_outer->ip_dst = rt->mfc_rp;
 2943:     /*
 2944:      * Copy the inner header TOS to the outer header, and take care of the
 2945:      * IP_DF bit.
 2946:      */
 2947:     ip_outer->ip_tos = ip->ip_tos;
 2948:     if (ntohs(ip->ip_off) & IP_DF)
 2949: 	ip_outer->ip_off |= IP_DF;
 2950:     pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
 2951: 					 + sizeof(pim_encap_iphdr));
 2952:     *pimhdr = pim_encap_pimhdr;
 2953:     /* If the iif crosses a border, set the Border-bit */
 2954:     if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
 2955: 	pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
 2956:     
 2957:     mb_first->m_data += sizeof(pim_encap_iphdr);
 2958:     pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
 2959:     mb_first->m_data -= sizeof(pim_encap_iphdr);
 2960:     
 2961:     if (vifp->v_rate_limit == 0)
 2962: 	tbf_send_packet(vifp, mb_first);
 2963:     else
 2964: 	tbf_control(vifp, mb_first, ip, ip_outer->ip_len);
 2965:     
 2966:     /* Keep statistics */
 2967:     pimstat.pims_snd_registers_msgs++;
 2968:     pimstat.pims_snd_registers_bytes += len;
 2969:     
 2970:     return 0;
 2971: }
 2972: 
 2973: /*
 2974:  * PIM-SMv2 and PIM-DM messages processing.
 2975:  * Receives and verifies the PIM control messages, and passes them
 2976:  * up to the listening socket, using rip_input().
 2977:  * The only message with special processing is the PIM_REGISTER message
 2978:  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 2979:  * is passed to if_simloop().
 2980:  */
 2981: void
 2982: pim_input(struct mbuf *m, int off, int proto)
 2983: {
 2984:     struct ip *ip = mtod(m, struct ip *);
 2985:     struct pim *pim;
 2986:     int minlen;
 2987:     int datalen = ip->ip_len;
 2988:     int ip_tos;
 2989:     int iphlen = off;
 2990:     
 2991:     /* Keep statistics */
 2992:     pimstat.pims_rcv_total_msgs++;
 2993:     pimstat.pims_rcv_total_bytes += datalen;
 2994:     
 2995:     /*
 2996:      * Validate lengths
 2997:      */
 2998:     if (datalen < PIM_MINLEN) {
 2999: 	pimstat.pims_rcv_tooshort++;
 3000: 	log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
 3001: 	    datalen, (u_long)ip->ip_src.s_addr);
 3002: 	m_freem(m);
 3003: 	return;
 3004:     }
 3005:     
 3006:     /*
 3007:      * If the packet is at least as big as a REGISTER, go agead
 3008:      * and grab the PIM REGISTER header size, to avoid another
 3009:      * possible m_pullup() later.
 3010:      * 
 3011:      * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
 3012:      * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
 3013:      */
 3014:     minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
 3015:     /*
 3016:      * Get the IP and PIM headers in contiguous memory, and
 3017:      * possibly the PIM REGISTER header.
 3018:      */
 3019:     if ((m->m_flags & M_EXT || m->m_len < minlen) &&
 3020: 	(m = m_pullup(m, minlen)) == 0) {
 3021: 	log(LOG_ERR, "pim_input: m_pullup failure\n");
 3022: 	return;
 3023:     }
 3024:     /* m_pullup() may have given us a new mbuf so reset ip. */
 3025:     ip = mtod(m, struct ip *);
 3026:     ip_tos = ip->ip_tos;
 3027:     
 3028:     /* adjust mbuf to point to the PIM header */
 3029:     m->m_data += iphlen;
 3030:     m->m_len  -= iphlen;
 3031:     pim = mtod(m, struct pim *);
 3032:     
 3033:     /*
 3034:      * Validate checksum. If PIM REGISTER, exclude the data packet.
 3035:      *
 3036:      * XXX: some older PIMv2 implementations don't make this distinction,
 3037:      * so for compatibility reason perform the checksum over part of the
 3038:      * message, and if error, then over the whole message.
 3039:      */
 3040:     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
 3041: 	/* do nothing, checksum okay */
 3042:     } else if (in_cksum(m, datalen)) {
 3043: 	pimstat.pims_rcv_badsum++;
 3044: 	if (mrtdebug & DEBUG_PIM)
 3045: 	    log(LOG_DEBUG, "pim_input: invalid checksum");
 3046: 	m_freem(m);
 3047: 	return;
 3048:     }
 3049: 
 3050:     /* PIM version check */
 3051:     if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
 3052: 	pimstat.pims_rcv_badversion++;
 3053: 	log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
 3054: 	    PIM_VT_V(pim->pim_vt), PIM_VERSION);
 3055: 	m_freem(m);
 3056: 	return;
 3057:     }
 3058:     
 3059:     /* restore mbuf back to the outer IP */
 3060:     m->m_data -= iphlen;
 3061:     m->m_len  += iphlen;
 3062:     
 3063:     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
 3064: 	/*
 3065: 	 * Since this is a REGISTER, we'll make a copy of the register
 3066: 	 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
 3067: 	 * routing daemon.
 3068: 	 */
 3069: 	struct sockaddr_in dst = { sizeof(dst), AF_INET };
 3070: 	struct mbuf *mcp;
 3071: 	struct ip *encap_ip;
 3072: 	u_int32_t *reghdr;
 3073: 	
 3074: 	if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
 3075: 	    if (mrtdebug & DEBUG_PIM)
 3076: 		log(LOG_DEBUG,
 3077: 		    "pim_input: register vif not set: %d\n", reg_vif_num);
 3078: 	    m_freem(m);
 3079: 	    return;
 3080: 	}
 3081: 	
 3082: 	/*
 3083: 	 * Validate length
 3084: 	 */
 3085: 	if (datalen < PIM_REG_MINLEN) {
 3086: 	    pimstat.pims_rcv_tooshort++;
 3087: 	    pimstat.pims_rcv_badregisters++;
 3088: 	    log(LOG_ERR,
 3089: 		"pim_input: register packet size too small %d from %lx\n",
 3090: 		datalen, (u_long)ip->ip_src.s_addr);
 3091: 	    m_freem(m);
 3092: 	    return;
 3093: 	}
 3094: 	
 3095: 	reghdr = (u_int32_t *)(pim + 1);
 3096: 	encap_ip = (struct ip *)(reghdr + 1);
 3097: 	
 3098: 	if (mrtdebug & DEBUG_PIM) {
 3099: 	    log(LOG_DEBUG,
 3100: 		"pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
 3101: 		(u_long)ntohl(encap_ip->ip_src.s_addr),
 3102: 		(u_long)ntohl(encap_ip->ip_dst.s_addr),
 3103: 		ntohs(encap_ip->ip_len));
 3104: 	}
 3105: 	
 3106: 	/* verify the version number of the inner packet */
 3107: 	if (encap_ip->ip_v != IPVERSION) {
 3108: 	    pimstat.pims_rcv_badregisters++;
 3109: 	    if (mrtdebug & DEBUG_PIM) {
 3110: 		log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
 3111: 		    "of the inner packet\n", encap_ip->ip_v);
 3112: 	    }
 3113: 	    m_freem(m);
 3114: 	    return;
 3115: 	}
 3116: 	
 3117: 	/* verify the inner packet is destined to a mcast group */
 3118: 	if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
 3119: 	    pimstat.pims_rcv_badregisters++;
 3120: 	    if (mrtdebug & DEBUG_PIM)
 3121: 		log(LOG_DEBUG,
 3122: 		    "pim_input: inner packet of register is not "
 3123: 		    "multicast %lx\n",
 3124: 		    (u_long)ntohl(encap_ip->ip_dst.s_addr));
 3125: 	    m_freem(m);
 3126: 	    return;
 3127: 	}
 3128: 
 3129: 	/* If a NULL_REGISTER, pass it to the daemon */
 3130: 	if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 3131: 		goto pim_input_to_daemon;
 3132: 
 3133: 	/*
 3134: 	 * Copy the TOS from the outer IP header to the inner IP header.
 3135: 	 */
 3136: 	if (encap_ip->ip_tos != ip_tos) {
 3137: 	    /* Outer TOS -> inner TOS */
 3138: 	    encap_ip->ip_tos = ip_tos;
 3139: 	    /* Recompute the inner header checksum. Sigh... */
 3140: 	    
 3141: 	    /* adjust mbuf to point to the inner IP header */
 3142: 	    m->m_data += (iphlen + PIM_MINLEN);
 3143: 	    m->m_len  -= (iphlen + PIM_MINLEN);
 3144: 	    
 3145: 	    encap_ip->ip_sum = 0;
 3146: 	    encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
 3147: 	    
 3148: 	    /* restore mbuf to point back to the outer IP header */
 3149: 	    m->m_data -= (iphlen + PIM_MINLEN);
 3150: 	    m->m_len  += (iphlen + PIM_MINLEN);
 3151: 	}
 3152: 
 3153: 	/*
 3154: 	 * Decapsulate the inner IP packet and loopback to forward it
 3155: 	 * as a normal multicast packet. Also, make a copy of the 
 3156: 	 *     outer_iphdr + pimhdr + reghdr + encap_iphdr
 3157: 	 * to pass to the daemon later, so it can take the appropriate
 3158: 	 * actions (e.g., send back PIM_REGISTER_STOP).
 3159: 	 * XXX: here m->m_data points to the outer IP header.
 3160: 	 */
 3161: 	mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
 3162: 	if (mcp == NULL) {
 3163: 	    log(LOG_ERR,
 3164: 		"pim_input: pim register: could not copy register head\n");
 3165: 	    m_freem(m);
 3166: 	    return;
 3167: 	}
 3168: 	
 3169: 	/* Keep statistics */
 3170: 	/* XXX: registers_bytes include only the encap. mcast pkt */
 3171: 	pimstat.pims_rcv_registers_msgs++;
 3172: 	pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
 3173: 	
 3174: 	/*
 3175: 	 * forward the inner ip packet; point m_data at the inner ip.
 3176: 	 */
 3177: 	m_adj(m, iphlen + PIM_MINLEN);
 3178: 	
 3179: 	if (mrtdebug & DEBUG_PIM) {
 3180: 	    log(LOG_DEBUG,
 3181: 		"pim_input: forwarding decapsulated register: "
 3182: 		"src %lx, dst %lx, vif %d\n",
 3183: 		(u_long)ntohl(encap_ip->ip_src.s_addr),
 3184: 		(u_long)ntohl(encap_ip->ip_dst.s_addr),
 3185: 		reg_vif_num);
 3186: 	}
 3187: 	if_simloop(viftable[reg_vif_num].v_ifp, m, dst.sin_family, 0);
 3188: 	
 3189: 	/* prepare the register head to send to the mrouting daemon */
 3190: 	m = mcp;
 3191:     }
 3192: 
 3193: pim_input_to_daemon:    
 3194:     /*
 3195:      * Pass the PIM message up to the daemon; if it is a Register message,
 3196:      * pass the 'head' only up to the daemon. This includes the
 3197:      * outer IP header, PIM header, PIM-Register header and the
 3198:      * inner IP header.
 3199:      * XXX: the outer IP header pkt size of a Register is not adjust to
 3200:      * reflect the fact that the inner multicast data is truncated.
 3201:      */
 3202:     rip_input(m, iphlen, proto);
 3203: 
 3204:     return;
 3205: }
 3206: #endif /* PIM */
 3207: 
 3208: static int
 3209: ip_mroute_modevent(module_t mod, int type, void *unused)
 3210: {
 3211:     int s;
 3212: 
 3213:     switch (type) {
 3214:     case MOD_LOAD:
 3215: 	s = splnet();
 3216: 	/* XXX Protect against multiple loading */
 3217: 	ip_mcast_src = X_ip_mcast_src;
 3218: 	ip_mforward = X_ip_mforward;
 3219: 	ip_mrouter_done = X_ip_mrouter_done;
 3220: 	ip_mrouter_get = X_ip_mrouter_get;
 3221: 	ip_mrouter_set = X_ip_mrouter_set;
 3222: 	ip_rsvp_force_done = X_ip_rsvp_force_done;
 3223: 	ip_rsvp_vif = X_ip_rsvp_vif;
 3224: 	ipip_input = X_ipip_input;
 3225: 	legal_vif_num = X_legal_vif_num;
 3226: 	mrt_ioctl = X_mrt_ioctl;
 3227: 	rsvp_input_p = X_rsvp_input;
 3228: 	splx(s);
 3229: 	break;
 3230: 
 3231:     case MOD_UNLOAD:
 3232: 	if (ip_mrouter)
 3233: 	    return EINVAL;
 3234: 
 3235: 	s = splnet();
 3236: 	ip_mcast_src = NULL;
 3237: 	ip_mforward = NULL;
 3238: 	ip_mrouter_done = NULL;
 3239: 	ip_mrouter_get = NULL;
 3240: 	ip_mrouter_set = NULL;
 3241: 	ip_rsvp_force_done = NULL;
 3242: 	ip_rsvp_vif = NULL;
 3243: 	ipip_input = NULL;
 3244: 	legal_vif_num = NULL;
 3245: 	mrt_ioctl = NULL;
 3246: 	rsvp_input_p = NULL;
 3247: 	splx(s);
 3248: 	break;
 3249:     }
 3250:     return 0;
 3251: }
 3252: 
 3253: static moduledata_t ip_mroutemod = {
 3254:     "ip_mroute",
 3255:     ip_mroute_modevent,
 3256:     0
 3257: };
 3258: DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY);