/* $NetBSD: bpf.c,v 1.229.2.4 2024/09/13 14:14:41 martin Exp $ */ /* * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)bpf.c 8.4 (Berkeley) 1/9/95 * static char rcsid[] = * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp "; */ #include __KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.229.2.4 2024/09/13 14:14:41 martin Exp $"); #if defined(_KERNEL_OPT) #include "opt_bpf.h" #include "sl.h" #include "strip.h" #include "opt_net_mpsafe.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef BPF_BUFSIZE /* * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k). */ # define BPF_BUFSIZE 32768 #endif #define PRINET 26 /* interruptible */ /* * The default read buffer size, and limit for BIOCSBLEN, is sysctl'able. * XXX the default values should be computed dynamically based * on available memory size and available mbuf clusters. */ static int bpf_bufsize = BPF_BUFSIZE; static int bpf_maxbufsize = BPF_DFLTBUFSIZE; /* XXX set dynamically, see above */ static bool bpf_jit = false; struct bpfjit_ops bpfjit_module_ops = { .bj_generate_code = NULL, .bj_free_code = NULL }; /* * Global BPF statistics returned by net.bpf.stats sysctl. 
*/ static struct percpu *bpf_gstats_percpu; /* struct bpf_stat */ #define BPF_STATINC(id) \ { \ struct bpf_stat *__stats = \ percpu_getref(bpf_gstats_percpu); \ __stats->bs_##id++; \ percpu_putref(bpf_gstats_percpu); \ } /* * Locking notes: * - bpf_mtx (adaptive mutex) protects: * - Gobal lists: bpf_iflist and bpf_dlist * - struct bpf_if * - bpf_close * - bpf_psz (pserialize) * - struct bpf_d has two mutexes: * - bd_buf_mtx (spin mutex) protects the buffers that can be accessed * on packet tapping * - bd_mtx (adaptive mutex) protects member variables other than the buffers * - Locking order: bpf_mtx => bpf_d#bd_mtx => bpf_d#bd_buf_mtx * - struct bpf_d obtained via fp->f_bpf in bpf_read and bpf_write is * never freed because struct bpf_d is only freed in bpf_close and * bpf_close never be called while executing bpf_read and bpf_write * - A filter that is assigned to bpf_d can be replaced with another filter * while tapping packets, so it needs to be done atomically * - struct bpf_d is iterated on bpf_dlist with psz * - struct bpf_if is iterated on bpf_iflist with psz or psref */ /* * Use a mutex to avoid a race condition between gathering the stats/peers * and opening/closing the device. 
*/
static kmutex_t bpf_mtx;

/* psref class and pserialize instance used by the list readers below. */
static struct psref_class	*bpf_psref_class __read_mostly;
static pserialize_t		bpf_psz;

/*
 * Acquire a passive reference on bp's bif_psref, recorded in the
 * caller-supplied psref; pair with bpf_if_release().
 */
static inline void
bpf_if_acquire(struct bpf_if *bp, struct psref *psref)
{

	psref_acquire(psref, &bp->bif_psref, bpf_psref_class);
}

/* Release a reference previously taken with bpf_if_acquire(). */
static inline void
bpf_if_release(struct bpf_if *bp, struct psref *psref)
{

	psref_release(psref, &bp->bif_psref, bpf_psref_class);
}

/*
 * bpf_iflist is the list of interfaces (struct bpf_if); each corresponds
 * to an ifnet.
 * bpf_dlist is the list of descriptors (struct bpf_d).
 */
static struct pslist_head bpf_iflist;
static struct pslist_head bpf_dlist;

/* Macros for bpf_d on bpf_dlist */
#define BPF_DLIST_WRITER_INSERT_HEAD(__d)				\
	PSLIST_WRITER_INSERT_HEAD(&bpf_dlist, (__d), bd_bpf_dlist_entry)
#define BPF_DLIST_READER_FOREACH(__d)					\
	PSLIST_READER_FOREACH((__d), &bpf_dlist, struct bpf_d,		\
	                      bd_bpf_dlist_entry)
#define BPF_DLIST_WRITER_FOREACH(__d)					\
	PSLIST_WRITER_FOREACH((__d), &bpf_dlist, struct bpf_d,		\
	                      bd_bpf_dlist_entry)
#define BPF_DLIST_ENTRY_INIT(__d)					\
	PSLIST_ENTRY_INIT((__d), bd_bpf_dlist_entry)
#define BPF_DLIST_WRITER_REMOVE(__d)					\
	PSLIST_WRITER_REMOVE((__d), bd_bpf_dlist_entry)
#define BPF_DLIST_ENTRY_DESTROY(__d)					\
	PSLIST_ENTRY_DESTROY((__d), bd_bpf_dlist_entry)

/* Macros for bpf_if on bpf_iflist */
#define BPF_IFLIST_WRITER_INSERT_HEAD(__bp)				\
	PSLIST_WRITER_INSERT_HEAD(&bpf_iflist, (__bp), bif_iflist_entry)
#define BPF_IFLIST_READER_FOREACH(__bp)					\
	PSLIST_READER_FOREACH((__bp), &bpf_iflist, struct bpf_if,	\
	                      bif_iflist_entry)
#define BPF_IFLIST_WRITER_FOREACH(__bp)					\
	PSLIST_WRITER_FOREACH((__bp), &bpf_iflist, struct bpf_if,	\
	                      bif_iflist_entry)
#define BPF_IFLIST_WRITER_REMOVE(__bp)					\
	PSLIST_WRITER_REMOVE((__bp), bif_iflist_entry)
#define BPF_IFLIST_ENTRY_INIT(__bp)					\
	PSLIST_ENTRY_INIT((__bp), bif_iflist_entry)
#define BPF_IFLIST_ENTRY_DESTROY(__bp)					\
	PSLIST_ENTRY_DESTROY((__bp), bif_iflist_entry)

/* Macros for bpf_d on bpf_if#bif_dlist_head */
#define BPFIF_DLIST_READER_FOREACH(__d, __bp)				\
	PSLIST_READER_FOREACH((__d), &(__bp)->bif_dlist_head, struct bpf_d, \
	                      bd_bif_dlist_entry)
#define BPFIF_DLIST_WRITER_INSERT_HEAD(__bp, __d)			\
	PSLIST_WRITER_INSERT_HEAD(&(__bp)->bif_dlist_head, (__d),	\
	                          bd_bif_dlist_entry)
#define BPFIF_DLIST_WRITER_REMOVE(__d)					\
	PSLIST_WRITER_REMOVE((__d), bd_bif_dlist_entry)
#define BPFIF_DLIST_ENTRY_INIT(__d)					\
	PSLIST_ENTRY_INIT((__d), bd_bif_dlist_entry)
#define BPFIF_DLIST_READER_EMPTY(__bp)					\
	(PSLIST_READER_FIRST(&(__bp)->bif_dlist_head, struct bpf_d,	\
	                     bd_bif_dlist_entry) == NULL)
#define BPFIF_DLIST_WRITER_EMPTY(__bp)					\
	(PSLIST_WRITER_FIRST(&(__bp)->bif_dlist_head, struct bpf_d,	\
	                     bd_bif_dlist_entry) == NULL)
#define BPFIF_DLIST_ENTRY_DESTROY(__d)					\
	PSLIST_ENTRY_DESTROY((__d), bd_bif_dlist_entry)

/* Forward declarations of the file-local helpers and fileops handlers. */
static int	bpf_allocbufs(struct bpf_d *);
static void	bpf_deliver(struct bpf_if *,
		            void *(*cpfn)(void *, const void *, size_t),
		            void *, u_int, u_int, const u_int);
static void	bpf_freed(struct bpf_d *);
static void	bpf_free_filter(struct bpf_filter *);
static void	bpf_ifname(struct ifnet *, struct ifreq *);
static void	*bpf_mcpy(void *, const void *, size_t);
static int	bpf_movein(struct uio *, int, uint64_t,
			        struct mbuf **, struct sockaddr *);
static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
static void	bpf_detachd(struct bpf_d *);
static int	bpf_setif(struct bpf_d *, struct ifreq *);
static int	bpf_setf(struct bpf_d *, struct bpf_program *);
static void	bpf_timed_out(void *);
static inline void
		bpf_wakeup(struct bpf_d *);
static int	bpf_hdrlen(struct bpf_d *);
static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
    void *(*)(void *, const void *, size_t), struct timespec *);
static void	reset_d(struct bpf_d *);
static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int	bpf_setdlt(struct bpf_d *, u_int);

static int	bpf_read(struct file *, off_t *, struct uio *, kauth_cred_t,
    int);
static int	bpf_write(struct file *, off_t *, struct uio *, kauth_cred_t,
    int);
static int	bpf_ioctl(struct file *, u_long, void *);
static int
bpf_poll(struct file *, int); static int bpf_stat(struct file *, struct stat *); static int bpf_close(struct file *); static int bpf_kqfilter(struct file *, struct knote *); static void bpf_softintr(void *); static const struct fileops bpf_fileops = { .fo_name = "bpf", .fo_read = bpf_read, .fo_write = bpf_write, .fo_ioctl = bpf_ioctl, .fo_fcntl = fnullop_fcntl, .fo_poll = bpf_poll, .fo_stat = bpf_stat, .fo_close = bpf_close, .fo_kqfilter = bpf_kqfilter, .fo_restart = fnullop_restart, }; dev_type_open(bpfopen); const struct cdevsw bpf_cdevsw = { .d_open = bpfopen, .d_close = noclose, .d_read = noread, .d_write = nowrite, .d_ioctl = noioctl, .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; bpfjit_func_t bpf_jit_generate(bpf_ctx_t *bc, void *code, size_t size) { membar_consumer(); if (bpfjit_module_ops.bj_generate_code != NULL) { return bpfjit_module_ops.bj_generate_code(bc, code, size); } return NULL; } void bpf_jit_freecode(bpfjit_func_t jcode) { KASSERT(bpfjit_module_ops.bj_free_code != NULL); bpfjit_module_ops.bj_free_code(jcode); } static int bpf_movein(struct uio *uio, int linktype, uint64_t mtu, struct mbuf **mp, struct sockaddr *sockp) { struct mbuf *m; int error; size_t len; size_t hlen; size_t align; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; align = 0; break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 0; align = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? 
*/ /* 6(dst)+6(src)+2(type) */ hlen = sizeof(struct ether_header); align = 2; break; case DLT_ARCNET: sockp->sa_family = AF_UNSPEC; hlen = ARC_HDRLEN; align = 5; break; case DLT_FDDI: sockp->sa_family = AF_LINK; /* XXX 4(FORMAC)+6(dst)+6(src) */ hlen = 16; align = 0; break; case DLT_ECONET: sockp->sa_family = AF_UNSPEC; hlen = 6; align = 2; break; case DLT_NULL: sockp->sa_family = AF_UNSPEC; hlen = 0; align = 0; break; default: return (EIO); } len = uio->uio_resid; /* * If there aren't enough bytes for a link level header or the * packet length exceeds the interface mtu, return an error. */ if (len - hlen > mtu) return (EMSGSIZE); /* * XXX Avoid complicated buffer chaining --- * bail if it won't fit in a single mbuf. * (Take into account possible alignment bytes) */ if (len + align > MCLBYTES) return (EIO); m = m_gethdr(M_WAIT, MT_DATA); m_reset_rcvif(m); m->m_pkthdr.len = (int)(len - hlen); if (len + align > MHLEN) { m_clget(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) { error = ENOBUFS; goto bad; } } /* Insure the data is properly aligned */ if (align > 0) { m->m_data += align; m->m_len -= (int)align; } error = uiomove(mtod(m, void *), len, uio); if (error) goto bad; if (hlen != 0) { memcpy(sockp->sa_data, mtod(m, void *), hlen); m->m_data += hlen; /* XXX */ len -= hlen; } m->m_len = (int)len; *mp = m; return (0); bad: m_freem(m); return (error); } /* * Attach file to the bpf interface, i.e. make d listen on bp. */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { KASSERT(mutex_owned(&bpf_mtx)); KASSERT(mutex_owned(d->bd_mtx)); /* * Point d at bp, and add d to the interface's list of listeners. * Finally, point the driver's bpf cookie at the interface so * it will divert packets to bpf. */ d->bd_bif = bp; BPFIF_DLIST_WRITER_INSERT_HEAD(bp, d); *bp->bif_driverp = bp; } /* * Detach a file from its interface. 
*/ static void bpf_detachd(struct bpf_d *d) { struct bpf_if *bp; KASSERT(mutex_owned(&bpf_mtx)); KASSERT(mutex_owned(d->bd_mtx)); bp = d->bd_bif; /* * Check if this descriptor had requested promiscuous mode. * If so, turn it off. */ if (d->bd_promisc) { int error __diagused; d->bd_promisc = 0; /* * Take device out of promiscuous mode. Since we were * able to enter promiscuous mode, we should be able * to turn it off. But we can get an error if * the interface was configured down, so only panic * if we don't get an unexpected error. */ KERNEL_LOCK_UNLESS_NET_MPSAFE(); error = ifpromisc(bp->bif_ifp, 0); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); #ifdef DIAGNOSTIC if (error) printf("%s: ifpromisc failed: %d", __func__, error); #endif } /* Remove d from the interface's descriptor list. */ BPFIF_DLIST_WRITER_REMOVE(d); pserialize_perform(bpf_psz); if (BPFIF_DLIST_WRITER_EMPTY(bp)) { /* * Let the driver know that there are no more listeners. */ *d->bd_bif->bif_driverp = NULL; } d->bd_bif = NULL; } static void bpf_init(void) { mutex_init(&bpf_mtx, MUTEX_DEFAULT, IPL_NONE); bpf_psz = pserialize_create(); bpf_psref_class = psref_class_create("bpf", IPL_SOFTNET); PSLIST_INIT(&bpf_iflist); PSLIST_INIT(&bpf_dlist); bpf_gstats_percpu = percpu_alloc(sizeof(struct bpf_stat)); return; } /* * bpfilterattach() is called at boot time. We don't need to do anything * here, since any initialization will happen as part of module init code. */ /* ARGSUSED */ void bpfilterattach(int n) { } /* * Open ethernet device. Clones. */ /* ARGSUSED */ int bpfopen(dev_t dev, int flag, int mode, struct lwp *l) { struct bpf_d *d; struct file *fp; int error, fd; /* falloc() will fill in the descriptor for us. 
*/ if ((error = fd_allocfile(&fp, &fd)) != 0) return error; d = kmem_zalloc(sizeof(*d), KM_SLEEP); d->bd_bufsize = bpf_bufsize; d->bd_direction = BPF_D_INOUT; d->bd_feedback = 0; d->bd_pid = l->l_proc->p_pid; #ifdef _LP64 if (curproc->p_flag & PK_32) d->bd_compat32 = 1; #endif getnanotime(&d->bd_btime); d->bd_atime = d->bd_mtime = d->bd_btime; callout_init(&d->bd_callout, CALLOUT_MPSAFE); selinit(&d->bd_sel); d->bd_sih = softint_establish(SOFTINT_CLOCK, bpf_softintr, d); d->bd_jitcode = NULL; d->bd_filter = NULL; BPF_DLIST_ENTRY_INIT(d); BPFIF_DLIST_ENTRY_INIT(d); d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SOFTNET); d->bd_buf_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET); cv_init(&d->bd_cv, "bpf"); mutex_enter(&bpf_mtx); BPF_DLIST_WRITER_INSERT_HEAD(d); mutex_exit(&bpf_mtx); return fd_clone(fp, fd, flag, &bpf_fileops, d); } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ /* ARGSUSED */ static int bpf_close(struct file *fp) { struct bpf_d *d; mutex_enter(&bpf_mtx); if ((d = fp->f_bpf) == NULL) { mutex_exit(&bpf_mtx); return 0; } /* * Refresh the PID associated with this bpf file. */ d->bd_pid = curproc->p_pid; mutex_enter(d->bd_buf_mtx); if (d->bd_state == BPF_WAITING) callout_halt(&d->bd_callout, d->bd_buf_mtx); d->bd_state = BPF_IDLE; mutex_exit(d->bd_buf_mtx); mutex_enter(d->bd_mtx); if (d->bd_bif) bpf_detachd(d); mutex_exit(d->bd_mtx); BPF_DLIST_WRITER_REMOVE(d); pserialize_perform(bpf_psz); mutex_exit(&bpf_mtx); BPFIF_DLIST_ENTRY_DESTROY(d); BPF_DLIST_ENTRY_DESTROY(d); fp->f_bpf = NULL; bpf_freed(d); callout_destroy(&d->bd_callout); seldestroy(&d->bd_sel); softint_disestablish(d->bd_sih); mutex_obj_free(d->bd_mtx); mutex_obj_free(d->bd_buf_mtx); cv_destroy(&d->bd_cv); kmem_free(d, sizeof(*d)); return (0); } /* * Rotate the packet buffers in descriptor d. Move the store buffer * into the hold slot, and the free buffer into the store slot. * Zero the length of the new store buffer. 
*/ #define ROTATE_BUFFERS(d) \ (d)->bd_hbuf = (d)->bd_sbuf; \ (d)->bd_hlen = (d)->bd_slen; \ (d)->bd_sbuf = (d)->bd_fbuf; \ (d)->bd_slen = 0; \ (d)->bd_fbuf = NULL; /* * bpfread - read next chunk of packets from buffers */ static int bpf_read(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int flags) { struct bpf_d *d = fp->f_bpf; int timed_out; int error; getnanotime(&d->bd_atime); /* * Restrict application to use a buffer the same size as * the kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); mutex_enter(d->bd_buf_mtx); if (d->bd_state == BPF_WAITING) callout_halt(&d->bd_callout, d->bd_buf_mtx); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; mutex_exit(d->bd_buf_mtx); /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ mutex_enter(d->bd_buf_mtx); while (d->bd_hbuf == NULL) { if (fp->f_flag & FNONBLOCK) { if (d->bd_slen == 0) { error = EWOULDBLOCK; goto out; } ROTATE_BUFFERS(d); break; } if ((d->bd_immediate || timed_out) && d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. * Rotate the buffers and return what's here. */ ROTATE_BUFFERS(d); break; } error = cv_timedwait_sig(&d->bd_cv, d->bd_buf_mtx, d->bd_rtout); if (error == EINTR || error == ERESTART) goto out; if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { error = 0; goto out; } ROTATE_BUFFERS(d); break; } if (error != 0) goto out; } /* * At this point, we know we have something in the hold slot. */ mutex_exit(d->bd_buf_mtx); /* * Move data from hold buffer into user space. 
* We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. */ error = uiomove(d->bd_hbuf, d->bd_hlen, uio); mutex_enter(d->bd_buf_mtx); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; out: mutex_exit(d->bd_buf_mtx); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. */ static inline void bpf_wakeup(struct bpf_d *d) { KASSERT(mutex_owned(d->bd_buf_mtx)); cv_broadcast(&d->bd_cv); if (d->bd_async) softint_schedule(d->bd_sih); selnotify(&d->bd_sel, 0, NOTE_SUBMIT); } static void bpf_softintr(void *cookie) { struct bpf_d *d; d = cookie; if (d->bd_async) fownsignal(d->bd_pgid, SIGIO, 0, 0, NULL); } static void bpf_timed_out(void *arg) { struct bpf_d *d = arg; mutex_enter(d->bd_buf_mtx); if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } mutex_exit(d->bd_buf_mtx); } static int bpf_write(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, int flags) { struct bpf_d *d = fp->f_bpf; struct bpf_if *bp; struct ifnet *ifp; struct mbuf *m, *mc; int error; static struct sockaddr_storage dst; struct psref psref; int bound; m = NULL; /* XXX gcc */ bound = curlwp_bind(); mutex_enter(d->bd_mtx); bp = d->bd_bif; if (bp == NULL) { mutex_exit(d->bd_mtx); error = ENXIO; goto out_bindx; } bpf_if_acquire(bp, &psref); mutex_exit(d->bd_mtx); getnanotime(&d->bd_mtime); ifp = bp->bif_ifp; if (if_is_deactivated(ifp)) { error = ENXIO; goto out; } if (uio->uio_resid == 0) { error = 0; goto out; } error = bpf_movein(uio, (int)bp->bif_dlt, ifp->if_mtu, &m, (struct sockaddr *) &dst); if (error) goto out; if (m->m_pkthdr.len > ifp->if_mtu) { m_freem(m); error = EMSGSIZE; goto out; } if (d->bd_hdrcmplt) dst.ss_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { mc = m_dup(m, 0, M_COPYALL, M_NOWAIT); if (mc != NULL) m_set_rcvif(mc, ifp); /* Set M_PROMISC for outgoing packets to be discarded. 
*/ if (1 /*d->bd_direction == BPF_D_INOUT*/) m->m_flags |= M_PROMISC; } else mc = NULL; error = if_output_lock(ifp, ifp, m, (struct sockaddr *) &dst, NULL); if (mc != NULL) { if (error == 0) { int s = splsoftnet(); KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp); ifp->_if_input(ifp, mc); KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp); splx(s); } else m_freem(mc); } /* * The driver frees the mbuf. */ out: bpf_if_release(bp, &psref); out_bindx: curlwp_bindx(bound); return error; } /* * Reset a descriptor by flushing its packet buffer and clearing the * receive and drop counts. */ static void reset_d(struct bpf_d *d) { KASSERT(mutex_owned(d->bd_mtx)); mutex_enter(d->bd_buf_mtx); if (d->bd_hbuf) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; } d->bd_slen = 0; d->bd_hlen = 0; d->bd_rcount = 0; d->bd_dcount = 0; d->bd_ccount = 0; mutex_exit(d->bd_buf_mtx); } /* * FIONREAD Check for read packet available. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set ethernet read filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag. * BIOCSHDRCMPLT Set "header already complete" flag. * BIOCSFEEDBACK Set packet feedback mode. * BIOCGFEEDBACK Get packet feedback mode. * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag */ /* ARGSUSED */ static int bpf_ioctl(struct file *fp, u_long cmd, void *addr) { struct bpf_d *d = fp->f_bpf; int error = 0; /* * Refresh the PID associated with this bpf file. 
*/ d->bd_pid = curproc->p_pid; #ifdef _LP64 if (curproc->p_flag & PK_32) d->bd_compat32 = 1; else d->bd_compat32 = 0; #endif mutex_enter(d->bd_buf_mtx); if (d->bd_state == BPF_WAITING) callout_halt(&d->bd_callout, d->bd_buf_mtx); d->bd_state = BPF_IDLE; mutex_exit(d->bd_buf_mtx); switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; mutex_enter(d->bd_buf_mtx); n = d->bd_slen; if (d->bd_hbuf) n += d->bd_hlen; mutex_exit(d->bd_buf_mtx); *(int *)addr = n; break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: *(u_int *)addr = d->bd_bufsize; break; /* * Set buffer length. */ case BIOCSBLEN: /* * Forbid to change the buffer length if buffers are already * allocated. */ mutex_enter(d->bd_mtx); mutex_enter(d->bd_buf_mtx); if (d->bd_bif != NULL || d->bd_sbuf != NULL) error = EINVAL; else { u_int size = *(u_int *)addr; if (size > bpf_maxbufsize) *(u_int *)addr = size = bpf_maxbufsize; else if (size < BPF_MINBUFSIZE) *(u_int *)addr = size = BPF_MINBUFSIZE; d->bd_bufsize = size; } mutex_exit(d->bd_buf_mtx); mutex_exit(d->bd_mtx); break; /* * Set link layer read filter. */ case BIOCSETF: error = bpf_setf(d, addr); break; /* * Flush read packet buffer. */ case BIOCFLUSH: mutex_enter(d->bd_mtx); reset_d(d); mutex_exit(d->bd_mtx); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: mutex_enter(d->bd_mtx); if (d->bd_bif == NULL) { mutex_exit(d->bd_mtx); /* * No interface attached yet. */ error = EINVAL; break; } if (d->bd_promisc == 0) { KERNEL_LOCK_UNLESS_NET_MPSAFE(); error = ifpromisc(d->bd_bif->bif_ifp, 1); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); if (error == 0) d->bd_promisc = 1; } mutex_exit(d->bd_mtx); break; /* * Get device parameters. */ case BIOCGDLT: mutex_enter(d->bd_mtx); if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; mutex_exit(d->bd_mtx); break; /* * Get a list of supported device parameters. 
*/ case BIOCGDLTLIST: mutex_enter(d->bd_mtx); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, addr); mutex_exit(d->bd_mtx); break; /* * Set device parameters. */ case BIOCSDLT: mutex_enter(&bpf_mtx); mutex_enter(d->bd_mtx); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); mutex_exit(d->bd_mtx); mutex_exit(&bpf_mtx); break; /* * Set interface name. */ #ifdef OBIOCGETIF case OBIOCGETIF: #endif case BIOCGETIF: mutex_enter(d->bd_mtx); if (d->bd_bif == NULL) error = EINVAL; else bpf_ifname(d->bd_bif->bif_ifp, addr); mutex_exit(d->bd_mtx); break; /* * Set interface. */ #ifdef OBIOCSETIF case OBIOCSETIF: #endif case BIOCSETIF: mutex_enter(&bpf_mtx); error = bpf_setif(d, addr); mutex_exit(&bpf_mtx); break; /* * Set read timeout. */ case BIOCSRTIMEOUT: { struct timeval *tv = addr; /* Compute number of ticks. */ if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000) { error = EINVAL; break; } else if (tv->tv_sec > INT_MAX/hz - 1) { d->bd_rtout = INT_MAX; } else { d->bd_rtout = tv->tv_sec * hz + tv->tv_usec / tick; } if ((d->bd_rtout == 0) && (tv->tv_usec != 0)) d->bd_rtout = 1; break; } #ifdef BIOCGORTIMEOUT /* * Get read timeout. */ case BIOCGORTIMEOUT: { struct timeval50 *tv = addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; break; } #endif #ifdef BIOCSORTIMEOUT /* * Set read timeout. */ case BIOCSORTIMEOUT: { struct timeval50 *tv = addr; /* Compute number of ticks. */ if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000) { error = EINVAL; break; } else if (tv->tv_sec > INT_MAX/hz - 1) { d->bd_rtout = INT_MAX; } else { d->bd_rtout = tv->tv_sec * hz + tv->tv_usec / tick; } if ((d->bd_rtout == 0) && (tv->tv_usec != 0)) d->bd_rtout = 1; break; } #endif /* * Get read timeout. */ case BIOCGRTIMEOUT: { struct timeval *tv = addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; break; } /* * Get packet stats. 
*/ case BIOCGSTATS: { struct bpf_stat *bs = addr; bs->bs_recv = d->bd_rcount; bs->bs_drop = d->bd_dcount; bs->bs_capt = d->bd_ccount; break; } case BIOCGSTATSOLD: { struct bpf_stat_old *bs = addr; bs->bs_recv = d->bd_rcount; bs->bs_drop = d->bd_dcount; break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: d->bd_immediate = *(u_int *)addr; break; case BIOCVERSION: { struct bpf_version *bv = addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } case BIOCGHDRCMPLT: /* get "header already complete" flag */ *(u_int *)addr = d->bd_hdrcmplt; break; case BIOCSHDRCMPLT: /* set "header already complete" flag */ d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0; break; /* * Get packet direction flag */ case BIOCGDIRECTION: *(u_int *)addr = d->bd_direction; break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: d->bd_direction = direction; break; default: error = EINVAL; } } break; /* * Set "feed packets from bpf back to input" mode */ case BIOCSFEEDBACK: d->bd_feedback = *(u_int *)addr; break; /* * Get "feed packets from bpf back to input" mode */ case BIOCGFEEDBACK: *(u_int *)addr = d->bd_feedback; break; case FIONBIO: /* Non-blocking I/O */ /* * No need to do anything special as we use IO_NDELAY in * bpfread() as an indication of whether or not to block * the read. */ break; case FIOASYNC: /* Send signal on receive packets */ mutex_enter(d->bd_mtx); d->bd_async = *(int *)addr; mutex_exit(d->bd_mtx); break; case TIOCSPGRP: /* Process or group to send signals to */ case FIOSETOWN: error = fsetown(&d->bd_pgid, cmd, addr); break; case TIOCGPGRP: case FIOGETOWN: error = fgetown(d->bd_pgid, cmd, addr); break; } return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. 
*/ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp) { struct bpf_insn *fcode; bpfjit_func_t jcode; size_t flen, size = 0; struct bpf_filter *oldf, *newf; jcode = NULL; flen = fp->bf_len; if ((fp->bf_insns == NULL && flen) || flen > BPF_MAXINSNS) { return EINVAL; } if (flen) { /* * Allocate the buffer, copy the byte-code from * userspace and validate it. */ size = flen * sizeof(*fp->bf_insns); fcode = kmem_alloc(size, KM_SLEEP); if (copyin(fp->bf_insns, fcode, size) != 0 || !bpf_validate(fcode, (int)flen)) { kmem_free(fcode, size); return EINVAL; } membar_consumer(); if (bpf_jit) jcode = bpf_jit_generate(NULL, fcode, flen); } else { fcode = NULL; } newf = kmem_alloc(sizeof(*newf), KM_SLEEP); newf->bf_insn = fcode; newf->bf_size = size; newf->bf_jitcode = jcode; d->bd_jitcode = jcode; /* XXX just for kvm(3) users */ /* Need to hold bpf_mtx for pserialize_perform */ mutex_enter(&bpf_mtx); mutex_enter(d->bd_mtx); oldf = d->bd_filter; d->bd_filter = newf; membar_producer(); reset_d(d); pserialize_perform(bpf_psz); mutex_exit(d->bd_mtx); mutex_exit(&bpf_mtx); if (oldf != NULL) bpf_free_filter(oldf); return 0; } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. */ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; char *cp; int unit_seen, i, error; KASSERT(mutex_owned(&bpf_mtx)); /* * Make sure the provided name has a unit number, and default * it to '0' if not specified. * XXX This is ugly ... do this differently? */ unit_seen = 0; cp = ifr->ifr_name; cp[sizeof(ifr->ifr_name) - 1] = '\0'; /* sanity */ while (*cp++) if (*cp >= '0' && *cp <= '9') unit_seen = 1; if (!unit_seen) { /* Make sure to leave room for the '\0'. 
*/ for (i = 0; i < (IFNAMSIZ - 1); ++i) { if ((ifr->ifr_name[i] >= 'a' && ifr->ifr_name[i] <= 'z') || (ifr->ifr_name[i] >= 'A' && ifr->ifr_name[i] <= 'Z')) continue; ifr->ifr_name[i] = '0'; } } /* * Look through attached interfaces for the named one. */ BPF_IFLIST_WRITER_FOREACH(bp) { struct ifnet *ifp = bp->bif_ifp; if (ifp == NULL || strcmp(ifp->if_xname, ifr->ifr_name) != 0) continue; /* skip additional entry */ if (bp->bif_driverp != &ifp->if_bpf) continue; /* * We found the requested interface. * Allocate the packet buffers if we need to. * If we're already attached to requested interface, * just flush the buffer. */ /* * bpf_allocbufs is called only here. bpf_mtx ensures that * no race condition happen on d->bd_sbuf. */ if (d->bd_sbuf == NULL) { error = bpf_allocbufs(d); if (error != 0) return (error); } mutex_enter(d->bd_mtx); if (bp != d->bd_bif) { if (d->bd_bif) { /* * Detach if attached to something else. */ bpf_detachd(d); BPFIF_DLIST_ENTRY_INIT(d); } bpf_attachd(d, bp); } reset_d(d); mutex_exit(d->bd_mtx); return (0); } /* Not found. */ return (ENXIO); } /* * Copy the interface name to the ifreq. */ static void bpf_ifname(struct ifnet *ifp, struct ifreq *ifr) { memcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ); } static int bpf_stat(struct file *fp, struct stat *st) { struct bpf_d *d = fp->f_bpf; (void)memset(st, 0, sizeof(*st)); mutex_enter(d->bd_mtx); st->st_dev = makedev(cdevsw_lookup_major(&bpf_cdevsw), d->bd_pid); st->st_atimespec = d->bd_atime; st->st_mtimespec = d->bd_mtime; st->st_ctimespec = st->st_birthtimespec = d->bd_btime; st->st_uid = kauth_cred_geteuid(fp->f_cred); st->st_gid = kauth_cred_getegid(fp->f_cred); st->st_mode = S_IFCHR; mutex_exit(d->bd_mtx); return 0; } /* * Support for poll() system call * * Return true iff the specific operation will not block indefinitely - with * the assumption that it is safe to positively acknowledge a request for the * ability to write to the BPF device. 
* Otherwise, return false but make a note that a selnotify() must be done.
 *
 * Write readiness (POLLOUT/POLLWRNORM) is always granted.  Read readiness
 * is granted when the hold buffer has data, or when the store buffer has
 * data and the descriptor is in immediate mode or its timeout has fired.
 */
static int
bpf_poll(struct file *fp, int events)
{
	struct bpf_d *d = fp->f_bpf;
	int revents;

	/*
	 * Refresh the PID associated with this bpf file.
	 */
	mutex_enter(&bpf_mtx);
	d->bd_pid = curproc->p_pid;

	revents = events & (POLLOUT | POLLWRNORM);
	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * An imitation of the FIONREAD ioctl code.
		 */
		mutex_enter(d->bd_mtx);
		mutex_enter(d->bd_buf_mtx);
		if (d->bd_hlen != 0 ||
		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
		     d->bd_slen != 0)) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			/* Nothing readable yet: register for selnotify(). */
			selrecord(curlwp, &d->bd_sel);
			/* Start the read timeout if necessary */
			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
				callout_reset(&d->bd_callout, d->bd_rtout,
					      bpf_timed_out, d);
				d->bd_state = BPF_WAITING;
			}
		}
		mutex_exit(d->bd_buf_mtx);
		mutex_exit(d->bd_mtx);
	}

	mutex_exit(&bpf_mtx);
	return (revents);
}

/* kqueue detach: unlink kn from the descriptor's klist under bd_buf_mtx. */
static void
filt_bpfrdetach(struct knote *kn)
{
	struct bpf_d *d = kn->kn_hook;

	mutex_enter(d->bd_buf_mtx);
	SLIST_REMOVE(&d->bd_sel.sel_klist, kn, knote, kn_selnext);
	mutex_exit(d->bd_buf_mtx);
}

/*
 * kqueue read event: kn_data is the hold-buffer length, plus the
 * store-buffer length in immediate mode.  Returns non-zero when that is
 * positive.  With NOTE_SUBMIT set in hint, bd_buf_mtx must already be held
 * (asserted); otherwise it is taken and released here.
 */
static int
filt_bpfread(struct knote *kn, long hint)
{
	struct bpf_d *d = kn->kn_hook;
	int rv;

	if (hint & NOTE_SUBMIT)
		KASSERT(mutex_owned(d->bd_buf_mtx));
	else
		mutex_enter(d->bd_buf_mtx);
	kn->kn_data = d->bd_hlen;
	if (d->bd_immediate)
		kn->kn_data += d->bd_slen;
	rv = (kn->kn_data > 0);
	if (hint & NOTE_SUBMIT)
		KASSERT(mutex_owned(d->bd_buf_mtx));
	else
		mutex_exit(d->bd_buf_mtx);
	return rv;
}

/* Filter operations for EVFILT_READ knotes on a bpf descriptor. */
static const struct filterops bpfread_filtops = {
	.f_isfd = 1,
	.f_attach = NULL,
	.f_detach = filt_bpfrdetach,
	.f_event = filt_bpfread,
};

/*
 * fo_kqfilter handler: only EVFILT_READ is supported (EINVAL otherwise).
 * Hooks kn onto the descriptor's klist under bd_buf_mtx.
 */
static int
bpf_kqfilter(struct file *fp, struct knote *kn)
{
	struct bpf_d *d = fp->f_bpf;
	struct klist *klist;

	mutex_enter(d->bd_buf_mtx);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		klist = &d->bd_sel.sel_klist;
		kn->kn_fop = &bpfread_filtops;
		break;

	default:
		mutex_exit(d->bd_buf_mtx);
		return (EINVAL);
	}

	kn->kn_hook = d;

	SLIST_INSERT_HEAD(klist, kn, kn_selnext);
	mutex_exit(d->bd_buf_mtx);

	return (0);
}

/*
 * Copy data from an mbuf chain into a buffer.  This code is derived
 * from m_copydata in sys/uipc_mbuf.c.
 *
 * src_arg is the head of the mbuf chain; len bytes are copied to dst_arg,
 * which is also the return value.  Panics if the chain ends before len
 * bytes have been copied.
 */
static void *
bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
{
	const struct mbuf *m;
	u_int count;
	u_char *dst;

	m = src_arg;
	dst = dst_arg;
	while (len > 0) {
		if (m == NULL)
			panic("bpf_mcpy");
		count = uimin(m->m_len, len);
		memcpy(dst, mtod(m, const void *), count);
		m = m->m_next;
		dst += count;
		len -= count;
	}
	return dst_arg;
}

/*
 * Dispatch a packet to all the listeners on interface bp.
 *
 * pkt       pointer to the packet, either a data buffer or an mbuf chain
 * buflen    buffer length, if pkt is a data buffer
 * cpfn      a function that can copy pkt into the listener's buffer
 * pktlen    length of the packet
 * direction BPF_D_IN or BPF_D_OUT
 */
static inline void
bpf_deliver(struct bpf_if *bp, void *(*cpfn)(void *, const void *, size_t),
    void *pkt, u_int pktlen, u_int buflen, const u_int direction)
{
	uint32_t mem[BPF_MEMWORDS];
	bpf_args_t args = {
		.pkt = (const uint8_t *)pkt,
		.wirelen = pktlen,
		.buflen = buflen,
		.mem = mem,
		.arg = NULL
	};
	bool gottime = false;
	struct timespec ts;
	struct bpf_d *d;
	int s;

	KASSERT(!cpu_intr_p());

	/*
	 * Note that the IPL does not have to be raised at this point.
	 * The only problem that could arise here is that if two different
	 * interfaces shared any data.  This is not the case.
	 */
	s = pserialize_read_enter();
	BPFIF_DLIST_READER_FOREACH(d, bp) {
		u_int slen = 0;
		struct bpf_filter *filter;

		/* Skip listeners that did not ask for this direction. */
		if (direction == BPF_D_IN) {
			if (d->bd_direction == BPF_D_OUT)
				continue;
		} else { /* BPF_D_OUT */
			if (d->bd_direction == BPF_D_IN)
				continue;
		}

		atomic_inc_ulong(&d->bd_rcount);
		BPF_STATINC(recv);

		/* Read the filter pointer once; it can be replaced at any time. */
		filter = d->bd_filter;
		membar_datadep_consumer();
		if (filter != NULL) {
			if (filter->bf_jitcode != NULL)
				slen = filter->bf_jitcode(NULL, &args);
			else
				slen = bpf_filter_ext(NULL, filter->bf_insn,
				    &args);
		} else {
			slen = (u_int)-1; /* No filter means accept all */
		}

		/* A zero result means the filter rejected the packet. */
		if (!slen) {
			continue;
		}
		/* Take the timestamp once, on the first accepting listener. */
		if (!gottime) {
			gottime = true;
			nanotime(&ts);
		}
		/* Assume catchpacket doesn't sleep */
		catchpacket(d, pkt, pktlen, slen, cpfn, &ts);
	}
	pserialize_read_exit(s);
}

/*
 * Incoming linkage from device drivers, when the head of the packet is in
 * a buffer, and the tail is in an mbuf chain.
 */
static void
_bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m,
	u_int direction)
{
	u_int pktlen;
	struct mbuf mb;

	/* Skip outgoing duplicate packets. */
	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif_index == 0) {
		m->m_flags &= ~M_PROMISC;
		return;
	}

	pktlen = m_length(m) + dlen;

	/*
	 * Craft on-stack mbuf suitable for passing to bpf_filter.
	 * Note that we cut corners here; we only setup what's
	 * absolutely needed--this mbuf should never go anywhere else.
	 */
	(void)memset(&mb, 0, sizeof(mb));
	mb.m_type = MT_DATA;
	mb.m_next = m;
	mb.m_data = data;
	mb.m_len = dlen;

	bpf_deliver(bp, bpf_mcpy, &mb, pktlen, 0, direction);
}

/*
 * Incoming linkage from device drivers, when packet is in an mbuf chain.
 */
static void
_bpf_mtap(struct bpf_if *bp, struct mbuf *m, u_int direction)
{
	void *(*cpfn)(void *, const void *, size_t);
	u_int pktlen, buflen;
	void *marg;

	/* Skip outgoing duplicate packets. */
	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif_index == 0) {
		m->m_flags &= ~M_PROMISC;
		return;
	}

	pktlen = m_length(m);

	/* Skip zero-sized packets.
*/ if (__predict_false(pktlen == 0)) { return; } if (pktlen == m->m_len) { cpfn = (void *)memcpy; marg = mtod(m, void *); buflen = pktlen; KASSERT(buflen != 0); } else { cpfn = bpf_mcpy; marg = m; buflen = 0; } bpf_deliver(bp, cpfn, marg, pktlen, buflen, direction); } /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ static void _bpf_mtap_af(struct bpf_if *bp, uint32_t af, struct mbuf *m, u_int direction) { struct mbuf m0; m0.m_type = MT_DATA; m0.m_flags = 0; m0.m_next = m; m0.m_nextpkt = NULL; m0.m_owner = NULL; m0.m_len = 4; m0.m_data = (char *)⁡ _bpf_mtap(bp, &m0, direction); } /* * Put the SLIP pseudo-"link header" in place. * Note this M_PREPEND() should never fail, * swince we know we always have enough space * in the input buffer. */ static void _bpf_mtap_sl_in(struct bpf_if *bp, u_char *chdr, struct mbuf **m) { u_char *hp; M_PREPEND(*m, SLIP_HDRLEN, M_DONTWAIT); if (*m == NULL) return; hp = mtod(*m, u_char *); hp[SLX_DIR] = SLIPDIR_IN; (void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN); _bpf_mtap(bp, *m, BPF_D_IN); m_adj(*m, SLIP_HDRLEN); } /* * Put the SLIP pseudo-"link header" in * place. The compressed header is now * at the beginning of the mbuf. 
*/ static void _bpf_mtap_sl_out(struct bpf_if *bp, u_char *chdr, struct mbuf *m) { struct mbuf m0; u_char *hp; m0.m_type = MT_DATA; m0.m_flags = 0; m0.m_next = m; m0.m_nextpkt = NULL; m0.m_owner = NULL; m0.m_data = m0.m_dat; m0.m_len = SLIP_HDRLEN; hp = mtod(&m0, u_char *); hp[SLX_DIR] = SLIPDIR_OUT; (void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN); _bpf_mtap(bp, &m0, BPF_D_OUT); m_freem(m); } static struct mbuf * bpf_mbuf_enqueue(struct bpf_if *bp, struct mbuf *m) { struct mbuf *dup; dup = m_dup(m, 0, M_COPYALL, M_NOWAIT); if (dup == NULL) return NULL; if (bp->bif_mbuf_tail != NULL) { bp->bif_mbuf_tail->m_nextpkt = dup; } else { bp->bif_mbuf_head = dup; } bp->bif_mbuf_tail = dup; #ifdef BPF_MTAP_SOFTINT_DEBUG log(LOG_DEBUG, "%s: enqueued mbuf=%p to %s\n", __func__, dup, bp->bif_ifp->if_xname); #endif return dup; } static struct mbuf * bpf_mbuf_dequeue(struct bpf_if *bp) { struct mbuf *m; int s; /* XXX NOMPSAFE: assumed running on one CPU */ s = splnet(); m = bp->bif_mbuf_head; if (m != NULL) { bp->bif_mbuf_head = m->m_nextpkt; m->m_nextpkt = NULL; if (bp->bif_mbuf_head == NULL) bp->bif_mbuf_tail = NULL; #ifdef BPF_MTAP_SOFTINT_DEBUG log(LOG_DEBUG, "%s: dequeued mbuf=%p from %s\n", __func__, m, bp->bif_ifp->if_xname); #endif } splx(s); return m; } static void bpf_mtap_si(void *arg) { struct bpf_if *bp = arg; struct mbuf *m; while ((m = bpf_mbuf_dequeue(bp)) != NULL) { #ifdef BPF_MTAP_SOFTINT_DEBUG log(LOG_DEBUG, "%s: tapping mbuf=%p on %s\n", __func__, m, bp->bif_ifp->if_xname); #endif bpf_ops->bpf_mtap(bp, m, BPF_D_IN); m_freem(m); } } static void _bpf_mtap_softint(struct ifnet *ifp, struct mbuf *m) { struct bpf_if *bp = ifp->if_bpf; struct mbuf *dup; KASSERT(cpu_intr_p()); /* To avoid extra invocations of the softint */ if (BPFIF_DLIST_READER_EMPTY(bp)) return; KASSERT(bp->bif_si != NULL); dup = bpf_mbuf_enqueue(bp, m); if (dup != NULL) softint_schedule(bp->bif_si); } static int bpf_hdrlen(struct bpf_d *d) { int hdrlen = d->bd_bif->bif_hdrlen; /* * Compute the length 
of the bpf header. This is not necessarily * equal to SIZEOF_BPF_HDR because we want to insert spacing such * that the network layer header begins on a longword boundary (for * performance reasons and to alleviate alignment restrictions). */ #ifdef _LP64 if (d->bd_compat32) return (BPF_WORDALIGN32(hdrlen + SIZEOF_BPF_HDR32) - hdrlen); else #endif return (BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen); } /* * Move the packet data from interface memory (pkt) into the * store buffer. Call the wakeup functions if it's time to wakeup * a listener (buffer full), "cpfn" is the routine called to do the * actual data transfer. memcpy is passed in to copy contiguous chunks, * while bpf_mcpy is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void *(*cpfn)(void *, const void *, size_t), struct timespec *ts) { char *h; int totlen, curlen, caplen; int hdrlen = bpf_hdrlen(d); int do_wakeup = 0; atomic_inc_ulong(&d->bd_ccount); BPF_STATINC(capt); /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ totlen = hdrlen + uimin(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * If we adjusted totlen to fit the bufsize, it could be that * totlen is smaller than hdrlen because of the link layer header. */ caplen = totlen - hdrlen; if (caplen < 0) caplen = 0; mutex_enter(d->bd_buf_mtx); /* * Round up the end of the previous packet to the next longword. */ #ifdef _LP64 if (d->bd_compat32) curlen = BPF_WORDALIGN32(d->bd_slen); else #endif curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize) { /* * This packet will overflow the storage buffer. * Rotate the buffers if we can, then wakeup any * pending reads. 
*/ if (d->bd_fbuf == NULL) { mutex_exit(d->bd_buf_mtx); /* * We haven't completed the previous read yet, * so drop the packet. */ atomic_inc_ulong(&d->bd_dcount); BPF_STATINC(drop); return; } ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) { /* * Immediate mode is set, or the read timeout has * already expired during a select call. A packet * arrived, so the reader should be woken up. */ do_wakeup = 1; } /* * Append the bpf header. */ h = (char *)d->bd_sbuf + curlen; #ifdef _LP64 if (d->bd_compat32) { struct bpf_hdr32 *hp32; hp32 = (struct bpf_hdr32 *)h; hp32->bh_tstamp.tv_sec = ts->tv_sec; hp32->bh_tstamp.tv_usec = ts->tv_nsec / 1000; hp32->bh_datalen = pktlen; hp32->bh_hdrlen = hdrlen; hp32->bh_caplen = caplen; } else #endif { struct bpf_hdr *hp; hp = (struct bpf_hdr *)h; hp->bh_tstamp.tv_sec = ts->tv_sec; hp->bh_tstamp.tv_usec = ts->tv_nsec / 1000; hp->bh_datalen = pktlen; hp->bh_hdrlen = hdrlen; hp->bh_caplen = caplen; } /* * Copy the packet data into the store buffer and update its length. */ (*cpfn)(h + hdrlen, pkt, caplen); d->bd_slen = curlen + totlen; /* * Call bpf_wakeup after bd_slen has been updated so that kevent(2) * will cause filt_bpfread() to be called with it adjusted. */ if (do_wakeup) bpf_wakeup(d); mutex_exit(d->bd_buf_mtx); } /* * Initialize all nonzero fields of a descriptor. 
*/ static int bpf_allocbufs(struct bpf_d *d) { d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP); if (!d->bd_fbuf) return (ENOBUFS); d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP); if (!d->bd_sbuf) { kmem_free(d->bd_fbuf, d->bd_bufsize); return (ENOBUFS); } d->bd_slen = 0; d->bd_hlen = 0; return (0); } static void bpf_free_filter(struct bpf_filter *filter) { KASSERT(filter != NULL); KASSERT(filter->bf_insn != NULL); kmem_free(filter->bf_insn, filter->bf_size); if (filter->bf_jitcode != NULL) bpf_jit_freecode(filter->bf_jitcode); kmem_free(filter, sizeof(*filter)); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpf_freed(struct bpf_d *d) { /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ if (d->bd_sbuf != NULL) { kmem_free(d->bd_sbuf, d->bd_bufsize); if (d->bd_hbuf != NULL) kmem_free(d->bd_hbuf, d->bd_bufsize); if (d->bd_fbuf != NULL) kmem_free(d->bd_fbuf, d->bd_bufsize); } if (d->bd_filter != NULL) { bpf_free_filter(d->bd_filter); d->bd_filter = NULL; } d->bd_jitcode = NULL; } /* * Attach an interface to bpf. dlt is the link layer type; * hdrlen is the fixed size of the link header for the specified dlt * (variable length headers not yet supported). 
*/ static void _bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; bp = kmem_alloc(sizeof(*bp), KM_NOSLEEP); if (bp == NULL) panic("bpfattach"); mutex_enter(&bpf_mtx); bp->bif_driverp = driverp; bp->bif_ifp = ifp; bp->bif_dlt = dlt; bp->bif_si = NULL; BPF_IFLIST_ENTRY_INIT(bp); PSLIST_INIT(&bp->bif_dlist_head); psref_target_init(&bp->bif_psref, bpf_psref_class); BPF_IFLIST_WRITER_INSERT_HEAD(bp); *bp->bif_driverp = NULL; bp->bif_hdrlen = hdrlen; mutex_exit(&bpf_mtx); #if 0 printf("bpf: %s attached\n", ifp->if_xname); #endif } static void _bpf_mtap_softint_init(struct ifnet *ifp) { struct bpf_if *bp; mutex_enter(&bpf_mtx); BPF_IFLIST_WRITER_FOREACH(bp) { if (bp->bif_ifp != ifp) continue; bp->bif_mbuf_head = NULL; bp->bif_mbuf_tail = NULL; bp->bif_si = softint_establish(SOFTINT_NET, bpf_mtap_si, bp); if (bp->bif_si == NULL) panic("%s: softint_establish() failed", __func__); break; } mutex_exit(&bpf_mtx); if (bp == NULL) panic("%s: no bpf_if found for %s", __func__, ifp->if_xname); } /* * Remove an interface from bpf. */ static void _bpfdetach(struct ifnet *ifp) { struct bpf_if *bp; struct bpf_d *d; int s; mutex_enter(&bpf_mtx); /* Nuke the vnodes for any open instances */ again_d: BPF_DLIST_WRITER_FOREACH(d) { mutex_enter(d->bd_mtx); if (d->bd_bif != NULL && d->bd_bif->bif_ifp == ifp) { /* * Detach the descriptor from an interface now. * It will be free'ed later by close routine. */ d->bd_promisc = 0; /* we can't touch device. 
*/ bpf_detachd(d); mutex_exit(d->bd_mtx); goto again_d; } mutex_exit(d->bd_mtx); } again: BPF_IFLIST_WRITER_FOREACH(bp) { if (bp->bif_ifp == ifp) { BPF_IFLIST_WRITER_REMOVE(bp); pserialize_perform(bpf_psz); psref_target_destroy(&bp->bif_psref, bpf_psref_class); BPF_IFLIST_ENTRY_DESTROY(bp); if (bp->bif_si != NULL) { /* XXX NOMPSAFE: assumed running on one CPU */ s = splnet(); while (bp->bif_mbuf_head != NULL) { struct mbuf *m = bp->bif_mbuf_head; bp->bif_mbuf_head = m->m_nextpkt; m_freem(m); } splx(s); softint_disestablish(bp->bif_si); } kmem_free(bp, sizeof(*bp)); goto again; } } mutex_exit(&bpf_mtx); } /* * Change the data link type of a interface. */ static void _bpf_change_type(struct ifnet *ifp, u_int dlt, u_int hdrlen) { struct bpf_if *bp; mutex_enter(&bpf_mtx); BPF_IFLIST_WRITER_FOREACH(bp) { if (bp->bif_driverp == &ifp->if_bpf) break; } if (bp == NULL) panic("bpf_change_type"); bp->bif_dlt = dlt; bp->bif_hdrlen = hdrlen; mutex_exit(&bpf_mtx); } /* * Get a list of available data link type of the interface. */ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { int n, error; struct ifnet *ifp; struct bpf_if *bp; int s, bound; KASSERT(mutex_owned(d->bd_mtx)); ifp = d->bd_bif->bif_ifp; n = 0; error = 0; bound = curlwp_bind(); s = pserialize_read_enter(); BPF_IFLIST_READER_FOREACH(bp) { if (bp->bif_ifp != ifp) continue; if (bfl->bfl_list != NULL) { struct psref psref; if (n >= bfl->bfl_len) { pserialize_read_exit(s); return ENOMEM; } bpf_if_acquire(bp, &psref); pserialize_read_exit(s); error = copyout(&bp->bif_dlt, bfl->bfl_list + n, sizeof(u_int)); s = pserialize_read_enter(); bpf_if_release(bp, &psref); } n++; } pserialize_read_exit(s); curlwp_bindx(bound); bfl->bfl_len = n; return error; } /* * Set the data link type of a BPF instance. 
*/ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; KASSERT(mutex_owned(&bpf_mtx)); KASSERT(mutex_owned(d->bd_mtx)); if (d->bd_bif->bif_dlt == dlt) return 0; ifp = d->bd_bif->bif_ifp; BPF_IFLIST_WRITER_FOREACH(bp) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } if (bp == NULL) return EINVAL; opromisc = d->bd_promisc; bpf_detachd(d); BPFIF_DLIST_ENTRY_INIT(d); bpf_attachd(d, bp); reset_d(d); if (opromisc) { KERNEL_LOCK_UNLESS_NET_MPSAFE(); error = ifpromisc(bp->bif_ifp, 1); KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); if (error) printf("%s: bpf_setdlt: ifpromisc failed (%d)\n", bp->bif_ifp->if_xname, error); else d->bd_promisc = 1; } return 0; } static int sysctl_net_bpf_maxbufsize(SYSCTLFN_ARGS) { int newsize, error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newsize; newsize = bpf_maxbufsize; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error || newp == NULL) return (error); if (newsize < BPF_MINBUFSIZE || newsize > BPF_MAXBUFSIZE) return (EINVAL); bpf_maxbufsize = newsize; return (0); } #if defined(MODULAR) || defined(BPFJIT) static int sysctl_net_bpf_jit(SYSCTLFN_ARGS) { bool newval; int error; struct sysctlnode node; node = *rnode; node.sysctl_data = &newval; newval = bpf_jit; error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error != 0 || newp == NULL) return error; bpf_jit = newval; /* * Do a full sync to publish new bpf_jit value and * update bpfjit_module_ops.bj_generate_code variable. 
*/ membar_sync(); if (newval && bpfjit_module_ops.bj_generate_code == NULL) { printf("JIT compilation is postponed " "until after bpfjit module is loaded\n"); } return 0; } #endif static int sysctl_net_bpf_peers(SYSCTLFN_ARGS) { int error, elem_count; struct bpf_d *dp; struct bpf_d_ext dpe; size_t len, needed, elem_size, out_size; char *sp; if (namelen == 1 && name[0] == CTL_QUERY) return (sysctl_query(SYSCTLFN_CALL(rnode))); if (namelen != 2) return (EINVAL); /* BPF peers is privileged information. */ error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_INTERFACE, KAUTH_REQ_NETWORK_INTERFACE_GETPRIV, NULL, NULL, NULL); if (error) return (EPERM); len = (oldp != NULL) ? *oldlenp : 0; sp = oldp; elem_size = name[0]; elem_count = name[1]; out_size = MIN(sizeof(dpe), elem_size); needed = 0; if (elem_size < 1 || elem_count < 0) return (EINVAL); mutex_enter(&bpf_mtx); BPF_DLIST_WRITER_FOREACH(dp) { if (len >= elem_size && elem_count > 0) { #define BPF_EXT(field) dpe.bde_ ## field = dp->bd_ ## field BPF_EXT(bufsize); BPF_EXT(promisc); BPF_EXT(state); BPF_EXT(immediate); BPF_EXT(hdrcmplt); BPF_EXT(direction); BPF_EXT(pid); BPF_EXT(rcount); BPF_EXT(dcount); BPF_EXT(ccount); #undef BPF_EXT mutex_enter(dp->bd_mtx); if (dp->bd_bif) (void)strlcpy(dpe.bde_ifname, dp->bd_bif->bif_ifp->if_xname, IFNAMSIZ - 1); else dpe.bde_ifname[0] = '\0'; mutex_exit(dp->bd_mtx); error = copyout(&dpe, sp, out_size); if (error) break; sp += elem_size; len -= elem_size; } needed += elem_size; if (elem_count > 0 && elem_count != INT_MAX) elem_count--; } mutex_exit(&bpf_mtx); *oldlenp = needed; return (error); } static void bpf_stats(void *p, void *arg, struct cpu_info *ci __unused) { struct bpf_stat *const stats = p; struct bpf_stat *sum = arg; sum->bs_recv += stats->bs_recv; sum->bs_drop += stats->bs_drop; sum->bs_capt += stats->bs_capt; } static int bpf_sysctl_gstats_handler(SYSCTLFN_ARGS) { struct sysctlnode node; int error; struct bpf_stat sum; memset(&sum, 0, sizeof(sum)); node = *rnode; 
percpu_foreach(bpf_gstats_percpu, bpf_stats, &sum); node.sysctl_data = ∑ node.sysctl_size = sizeof(sum); error = sysctl_lookup(SYSCTLFN_CALL(&node)); if (error != 0 || newp == NULL) return error; return 0; } static struct sysctllog *bpf_sysctllog; static void sysctl_net_bpf_setup(void) { const struct sysctlnode *node; node = NULL; sysctl_createv(&bpf_sysctllog, 0, NULL, &node, CTLFLAG_PERMANENT, CTLTYPE_NODE, "bpf", SYSCTL_DESCR("BPF options"), NULL, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL); if (node != NULL) { #if defined(MODULAR) || defined(BPFJIT) sysctl_createv(&bpf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "jit", SYSCTL_DESCR("Toggle Just-In-Time compilation"), sysctl_net_bpf_jit, 0, &bpf_jit, 0, CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL); #endif sysctl_createv(&bpf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxbufsize", SYSCTL_DESCR("Maximum size for data capture buffer"), sysctl_net_bpf_maxbufsize, 0, &bpf_maxbufsize, 0, CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(&bpf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "stats", SYSCTL_DESCR("BPF stats"), bpf_sysctl_gstats_handler, 0, NULL, 0, CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL); sysctl_createv(&bpf_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "peers", SYSCTL_DESCR("BPF peers"), sysctl_net_bpf_peers, 0, NULL, 0, CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL); } } struct bpf_ops bpf_ops_kernel = { .bpf_attach = _bpfattach, .bpf_detach = _bpfdetach, .bpf_change_type = _bpf_change_type, .bpf_mtap = _bpf_mtap, .bpf_mtap2 = _bpf_mtap2, .bpf_mtap_af = _bpf_mtap_af, .bpf_mtap_sl_in = _bpf_mtap_sl_in, .bpf_mtap_sl_out = _bpf_mtap_sl_out, .bpf_mtap_softint = _bpf_mtap_softint, .bpf_mtap_softint_init = _bpf_mtap_softint_init, }; MODULE(MODULE_CLASS_DRIVER, bpf, "bpf_filter"); static int bpf_modcmd(modcmd_t cmd, void *arg) { #ifdef _MODULE devmajor_t bmajor, cmajor; #endif int error = 0; 
switch (cmd) { case MODULE_CMD_INIT: bpf_init(); #ifdef _MODULE bmajor = cmajor = NODEVMAJOR; error = devsw_attach("bpf", NULL, &bmajor, &bpf_cdevsw, &cmajor); if (error) break; #endif bpf_ops_handover_enter(&bpf_ops_kernel); atomic_swap_ptr(&bpf_ops, &bpf_ops_kernel); bpf_ops_handover_exit(); sysctl_net_bpf_setup(); break; case MODULE_CMD_FINI: /* * While there is no reference counting for bpf callers, * unload could at least in theory be done similarly to * system call disestablishment. This should even be * a little simpler: * * 1) replace op vector with stubs * 2) post update to all cpus with xc * 3) check that nobody is in bpf anymore * (it's doubtful we'd want something like l_sysent, * but we could do something like *signed* percpu * counters. if the sum is 0, we're good). * 4) if fail, unroll changes * * NOTE: change won't be atomic to the outside. some * packets may be not captured even if unload is * not succesful. I think packet capture not working * is a perfectly logical consequence of trying to * disable packet capture. */ error = EOPNOTSUPP; /* insert sysctl teardown */ break; default: error = ENOTTY; break; } return error; }