Index: kern/kern_mib.c =================================================================== RCS file: /home/ncvs/src/sys/kern/kern_mib.c,v retrieving revision 1.29.2.4 diff -c -r1.29.2.4 kern_mib.c *** kern/kern_mib.c 30 Jul 2001 23:28:00 -0000 1.29.2.4 --- kern/kern_mib.c 15 Aug 2003 19:38:03 -0000 *************** *** 46,51 **** --- 46,52 ---- #include #include #include + #include #include SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, *************** *** 248,250 **** --- 249,269 ---- #include SYSCTL_INT(_debug_sizeof, OID_AUTO, specinfo, CTLFLAG_RD, 0, sizeof(struct specinfo), "sizeof(struct specinfo)"); + + /* sysctl node sendfile under _kern: sendfile buffer statistics and the cache timeout. NOTE(review): the KIPC_SNDF_xxx macros defined below are not referenced by the SYSCTL_INT entries that follow, which use KSNDF_xxx identifiers instead. */ + SYSCTL_NODE(_kern, KERN_SENDFILE, sendfile, CTLFLAG_RW, 0, "sendfile statistics"); + + #define KIPC_SNDF_NSFBUFS 1 /* number of sendfile bufs */ + #define KIPC_SNDF_NSFPGS 2 /* number of sendfile cached pages */ + #define KIPC_SNDF_NLRUPGS 3 /* number of sendfile free pages */ + #define KIPC_SNDF_TOSEC 4 + + SYSCTL_INT(_kern_sendfile, KSNDF_NSFBUFS, nsfbufs, CTLFLAG_RD, + &nsfbufs, 0, ""); + SYSCTL_INT(_kern_sendfile, KSNDF_NSFPGS, numsfpages, CTLFLAG_RD, + &numsfpages, 0, ""); + SYSCTL_INT(_kern_sendfile, KSNDF_NLRUPGS, lrusfpages, CTLFLAG_RD, + &lrusfpages, 0, ""); + SYSCTL_INT(_kern_sendfile, KSNDF_TOSEC, sf_cache_timeout_sec, CTLFLAG_RW, &sf_cache_timeout_sec, 0, ""); + /* end */ + Index: kern/uipc_syscalls.c =================================================================== RCS file: /home/ncvs/src/sys/kern/uipc_syscalls.c,v retrieving revision 1.65.2.9.6.1 diff -c -r1.65.2.9.6.1 uipc_syscalls.c *** kern/uipc_syscalls.c 13 Aug 2002 12:12:41 -0000 1.65.2.9.6.1 --- kern/uipc_syscalls.c 15 Aug 2003 19:38:07 -0000 *************** *** 59,64 **** --- 59,68 ---- #include #include #include + #include + #include + #include + #include #ifdef KTRACE #include #endif *************** *** 74,79 **** --- 78,86 ---- static struct sf_buf *sf_buf_alloc(void); static void sf_buf_ref(caddr_t addr, u_int size); static void sf_buf_free(caddr_t 
addr, u_int size); + static void sf_buf_timeout(void *arg); + static struct sf_buf *sf_page_lookup(vm_page_t pg); + static void sf_cache_insert(vm_page_t pg, struct sf_buf *sf); static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags)); static int recvit __P((struct proc *p, int s, struct msghdr *mp, *************** *** 1422,1427 **** --- 1429,1487 ---- return(error); } + + /*** support for sendfile optimization ***/ + + /* global variables exported by sysctl: + * lrusfpages: # of sendfile buffers in LRU list (available free buffer list) + * numsfpages: # of sendfile buffers cached + * sf_cache_timeout_sec: timeout after the last sendfile system call in seconds + * Any positive value enables the caching + * Setting it to 0 disables the sendfile buffer caching (NOTE(review): sf_buf_init assigns 300, so 0 is not the value in effect after initialization) + */ + + int numsfpages; + int lrusfpages; + int sf_cache_timeout_sec; + + typedef struct sf_buf *sf_buf_p; + static sf_buf_p *sfcBins; + static struct sf_buf *sfc_head, *sfc_tail; + static struct callout sf_timeout; + static int sf_tticks, sf_prev_tsec; + + #define address_hash(key) \ + ((((unsigned long)(key) >> 3)) & (nsfbufs - 1)) + + /* + ((((unsigned int)(key) >> 3) * 2654435761) & SFC_MASK) + */ + + #define sf_buf_LRU_remove(ent) \ + { \ + lrusfpages--; \ + if ((ent)->sf_prev) \ + (ent)->sf_prev->sf_next = (ent)->sf_next;\ + else \ + sfc_head = (ent)->sf_next; \ + if ((ent)->sf_next) \ + (ent)->sf_next->sf_prev = (ent)->sf_prev;\ + else \ + sfc_tail = (ent)->sf_prev; \ + } + + #define sf_buf_LRU_head_insert(ent) \ + { \ + lrusfpages++; \ + (ent)->sf_prev = NULL; \ + (ent)->sf_next = sfc_head; \ + if (sfc_head) \ + sfc_head->sf_prev = (ent); \ + else \ + sfc_tail = (ent); \ + sfc_head = (ent); \ + } + /* * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. 
:-)) * XXX - The sf_buf functions are currently private to sendfile(2), so have *************** *** 1432,1437 **** --- 1492,1498 ---- sf_buf_init(void *arg) { int i; + struct sf_buf *sf; SLIST_INIT(&sf_freelist); sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); *************** *** 1439,1524 **** bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf)); for (i = 0; i < nsfbufs; i++) { sf_bufs[i].kva = sf_base + i * PAGE_SIZE; ! SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list); } } /* ! * Get an sf_buf from the freelist. Will block if none are available. */ static struct sf_buf * sf_buf_alloc() { struct sf_buf *sf; int s; int error; s = splimp(); ! while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) { sf_buf_alloc_want = 1; ! error = tsleep(&sf_freelist, PVM|PCATCH, "sfbufa", 0); if (error) break; } if (sf != NULL) { ! SLIST_REMOVE_HEAD(&sf_freelist, free_list); ! sf->refcnt = 1; } splx(s); return (sf); } - #define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) static void ! sf_buf_ref(caddr_t addr, u_int size) { struct sf_buf *sf; ! sf = dtosf(addr); ! if (sf->refcnt == 0) ! panic("sf_buf_ref: referencing a free sf_buf"); ! sf->refcnt++; } ! /* ! * Lose a reference to an sf_buf. When none left, detach mapped page ! * and release resources back to the system. ! * ! * Must be called at splimp. */ static void ! sf_buf_free(caddr_t addr, u_int size) { ! struct sf_buf *sf; ! struct vm_page *m; ! int s; ! sf = dtosf(addr); ! if (sf->refcnt == 0) ! panic("sf_buf_free: freeing free sf_buf"); ! sf->refcnt--; ! if (sf->refcnt == 0) { ! pmap_qremove((vm_offset_t)addr, 1); ! m = sf->m; ! s = splvm(); ! vm_page_unwire(m, 0); ! /* ! * Check for the object going away on us. This can ! * happen since we don't hold a reference to it. ! * If so, we're responsible for freeing the page. ! */ ! if (m->wire_count == 0 && m->object == NULL) ! vm_page_free(m); ! splx(s); ! sf->m = NULL; ! SLIST_INSERT_HEAD(&sf_freelist, sf, free_list); ! 
if (sf_buf_alloc_want) { ! sf_buf_alloc_want = 0; ! wakeup(&sf_freelist); ! } ! } } /* * sendfile(2). * int sendfile(int fd, int s, off_t offset, size_t nbytes, --- 1500,1790 ---- bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf)); for (i = 0; i < nsfbufs; i++) { sf_bufs[i].kva = sf_base + i * PAGE_SIZE; ! ! /* put every buffer on the LRU list. NOTE(review): callout_init below is inside this loop, so it runs once per buffer. */ ! sf = &sf_bufs[i]; ! sf_buf_LRU_head_insert(sf); ! sf->sf_isOnLRU = 1; ! callout_init(&sf_timeout); } + + /* allocate nsfbufs hash bins for the page cache. NOTE(review): the M_NOWAIT malloc result is passed to bzero without a NULL check, and address_hash masks with nsfbufs - 1, which only spreads over all bins if nsfbufs is a power of two - confirm. */ + sfcBins = malloc(nsfbufs * sizeof(sf_buf_p), M_TEMP, M_NOWAIT); + bzero(sfcBins, nsfbufs * sizeof(sf_buf_p)); + + sf_cache_timeout_sec = 300; + } + + #define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) + static void + sf_buf_ref(caddr_t addr, u_int size) + { + struct sf_buf *sf; + + sf = dtosf(addr); + if (sf->refcnt == 0) { + panic("sf_buf_ref: referencing a free sf_buf"); + } + sf->refcnt++; + + return; + } + + /* Remove pmap entry, free wired pages. Called by + * 1. sf_buf_free if sf_buf cache is NOT enabled + * 2. sf_buf_alloc if the page on the LRU tail needs to be freed + * 3. sf_buf_timeout + */ + static void + sf_page_free(struct sf_buf *sf) + { + struct vm_page *m; + int s; + + numsfpages--; + + pmap_qremove((vm_offset_t)sf->kva, 1); + m = sf->m; + s = splvm(); + vm_page_unwire(m, 0); + /* + * Check for the object going away on us. This can + * happen since we don't hold a reference to it. + * If so, we're responsible for freeing the page. + */ + if (m->wire_count == 0 && m->object == NULL) + vm_page_free(m); + splx(s); + sf->m = NULL; } /* ! * Lose a reference to an sf_buf. When none left, detach mapped page ! * and release resources back to the system. ! * ! 
* Must be called at splimp. ! */ ! static void ! sf_buf_free(caddr_t addr, u_int size) ! { ! struct sf_buf *sf; ! ! sf = dtosf(addr); ! if (sf->refcnt == 0) ! panic("sf_buf_free: freeing free sf_buf"); ! ! sf->refcnt--; ! ! /* Don't free the page if caching is enabled */ ! 
if (sf_cache_timeout_sec && (sf->refcnt == 1) ) { ! sf_buf_LRU_head_insert(sf); ! sf->sf_isOnLRU = 1; ! if (sf_buf_alloc_want) { ! sf_buf_alloc_want = 0; ! wakeup(sfc_tail); ! } ! return; ! } ! ! /* Reached when caching is disabled, or when caching is enabled but the count did not drop to exactly 1 */ ! if (sf->refcnt == 0) { ! ! sf_page_free(sf); ! ! /* Also use LRU list for the free buffer list */ ! if (!sf->sf_isOnLRU) { ! sf_buf_LRU_head_insert(sf); ! sf->sf_isOnLRU = 1; ! if (sf_buf_alloc_want) { ! sf_buf_alloc_want = 0; ! wakeup(sfc_tail); ! } ! } ! } ! ! return; ! } ! ! /* When caching is NOT enabled (sf_cache_timeout_sec == 0): ! * Grab the LRU list tail and simply return. ! * When caching is enabled (sf_cache_timeout_sec > 0): ! * Free tail from the LRU list ! * Remove from the hash entry ! * Free the wired page */ static struct sf_buf * sf_buf_alloc() { struct sf_buf *sf; int s; + int hashBin; + vm_page_t pg; int error; + numsfpages ++; + s = splimp(); ! while ((sf = sfc_tail) == NULL) { sf_buf_alloc_want = 1; ! ! /* this should rarely happen when caching is enabled. NOTE(review): sfc_tail was just read as NULL in the loop condition, so the tsleep channel below is NULL, while sf_buf_free calls wakeup(sfc_tail) after inserting a buffer - confirm the sleeper can actually be woken. */ ! log(LOG_INFO, "sfbufa"); ! ! error = tsleep(sfc_tail, PVM|PCATCH, "sfbufa", 0); if (error) break; } + if (sf != NULL) { ! sf_buf_LRU_remove(sf); ! ! /* Buffers in the list should have refcnt 1 or 0 */ ! if (sf->refcnt > 1) ! panic("sf_cached_buf_allo: refcnt > 1"); ! ! /* need to free */ ! if (sf->refcnt == 1) { ! struct sf_buf *walk; ! ! /* remove from Hash entry */ ! pg = sf->m; ! hashBin = address_hash(pg); ! ! for (walk = sfcBins[hashBin]; walk; walk = walk->sf_nextHash) { ! if (walk == sf) { ! ! if (walk == sfcBins[hashBin]) { ! if (walk->sf_nextHash) ! walk->sf_nextHash->sf_prevHash = NULL; ! sfcBins[hashBin] = walk->sf_nextHash; ! } else { ! 
if (walk->sf_nextHash) ! walk->sf_nextHash->sf_prevHash = walk->sf_prevHash; ! *walk->sf_prevHash = walk->sf_nextHash; ! } ! ! break; ! } ! } ! ! /* free from pmap and unwire the page */ ! sf->refcnt --; ! sf_page_free(sf); ! } ! ! sf->refcnt = 1; ! 
sf->sf_isOnLRU = 0; } + splx(s); return (sf); } static void ! sf_buf_timeout(void *arg) { struct sf_buf *sf; ! /* Some buffers are still in use and have not yet been released through sf_buf_free. ! * Wait a while then come back again ! */ ! if (lrusfpages != nsfbufs) { ! log(LOG_INFO, "sendfile timeout delayed\n"); ! callout_reset(&sf_timeout, 10000, sf_buf_timeout, (void *) 0); ! return; ! } ! ! for (sf = sfc_tail; sf; sf = sf->sf_prev) { ! ! /* Clear link fields for hash entry*/ ! sf->sf_nextHash = NULL; ! sf->sf_prevHash = NULL; ! ! if (sf->refcnt == 0) ! continue; ! ! sf->refcnt--; ! ! /* free wired pages */ ! if (sf->refcnt == 0) ! sf_page_free(sf); ! else ! panic("sf_buf_timeout: refcnt not 0"); ! } ! ! /* need to clear hash bins */ ! bzero(sfcBins, nsfbufs * sizeof(sf_buf_p)); ! ! return; } ! /* Return cached sf_buf, bring it to the LRU head ! * The target sf_buf could be in LRU list already ! * or still an active one. So we need to increase refcnt in advance */ + static struct sf_buf * + sf_page_lookup(vm_page_t pg) + { + int hashBin; + struct sf_buf *walk, *ret = NULL; + int s; + + if (!sf_cache_timeout_sec) + return (NULL); + + s = splimp(); + + hashBin = address_hash(pg); + + for (walk = sfcBins[hashBin]; walk; walk = walk->sf_nextHash) { + if (walk->m == pg) { + /* take it off the LRU list; it is not reinserted in this function */ + if (walk->sf_isOnLRU) { + sf_buf_LRU_remove(walk); + } + + walk->sf_isOnLRU = 0; + + /* increase ref_cnt in advance to avoid being freed */ + walk->refcnt++; + + ret = walk; + break; + } + } + + splx(s); + + return (ret); + } + + /* This is a newly wired page, put into cache entry */ static void ! sf_cache_insert(vm_page_t pg, struct sf_buf *sf) { ! int hashBin; ! if (!sf_cache_timeout_sec) ! return; ! ! hashBin = address_hash(pg); ! ! /* insert into Hash entry */ ! sf->sf_prevHash = &(sfcBins[hashBin]); ! sf->sf_nextHash = sfcBins[hashBin]; ! ! if (sfcBins[hashBin]) ! 
(sfcBins[hashBin])->sf_prevHash = &sf->sf_nextHash; ! sfcBins[hashBin] = sf; ! ! 
/* increase ref_cnt in advance to avoid being freed */ ! sf->refcnt++; ! ! return; } + /*** end ***/ + /* * sendfile(2). * int sendfile(int fd, int s, off_t offset, size_t nbytes, *************** *** 1529,1534 **** --- 1795,1805 ---- * nbytes == 0. Optionally add a header and/or trailer to the socket * output. If specified, write the total number of bytes sent into *sbytes. */ + + /*** if flags is SF_NONIO (1), don't initiate IO if the page is not valid + *** but return error 999 instead + ***/ + int sendfile(struct proc *p, struct sendfile_args *uap) { *************** *** 1537,1543 **** struct vnode *vp; struct vm_object *obj; struct socket *so; ! struct mbuf *m; struct sf_buf *sf; struct vm_page *pg; struct writev_args nuap; --- 1808,1814 ---- struct vnode *vp; struct vm_object *obj; struct socket *so; ! struct mbuf *m, *hm = NULL; struct sf_buf *sf; struct vm_page *pg; struct writev_args nuap; *************** *** 1545,1555 **** off_t off, xfsize, sbytes = 0; int error = 0, s; vp = NULL; ! /* ! * Do argument checking. Must be a regular file in, stream ! * type and connected socket out, positive offset. ! */ fp = holdfp(fdp, uap->fd, FREAD); if (fp == NULL) { error = EBADF; --- 1816,1825 ---- off_t off, xfsize, sbytes = 0; int error = 0, s; + int headSent = 0, head_len = 0; + vp = NULL; ! fp = holdfp(fdp, uap->fd, FREAD); if (fp == NULL) { error = EBADF; *************** *** 1583,1631 **** goto done; } - /* - * If specified, get the pointer to the sf_hdtr struct for - * any headers/trailers. - */ if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error) goto done; - /* - * Send any headers. Wimp out and use writev(2). - */ - if (hdtr.headers != NULL) { - nuap.fd = uap->s; - nuap.iovp = hdtr.headers; - nuap.iovcnt = hdtr.hdr_cnt; - error = writev(p, &nuap); - if (error) - goto done; - sbytes += p->p_retval[0]; - } } - /* - * Protect against multiple writers to the socket. - */ (void) sblock(&so->so_snd, M_WAITOK); ! /* ! 
* Loop through the pages in the file, starting with the requested ! * offset. Get a file page (do I/O if necessary), map the file page ! * into an sf_buf, attach an mbuf header to the sf_buf, and queue ! * it on the socket. ! */ for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { vm_pindex_t pindex; vm_offset_t pgoff; pindex = OFF_TO_IDX(off); retry_lookup: - /* - * Calculate the amount to transfer. Not to exceed a page, - * the EOF, or the passed in nbytes. - */ xfsize = obj->un_pager.vnp.vnp_size - off; if (xfsize > PAGE_SIZE) xfsize = PAGE_SIZE; --- 1853,1886 ---- goto done; } if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error) goto done; } (void) sblock(&so->so_snd, M_WAITOK); ! /* finally timeout setup could be moved into sf_buf_init */ ! if (sf_cache_timeout_sec != sf_prev_tsec) { ! struct timeval tv; ! ! tv.tv_sec = sf_cache_timeout_sec; ! tv.tv_usec = 0; ! ! sf_tticks = tvtohz(&tv); ! sf_prev_tsec = sf_cache_timeout_sec; ! } ! ! if (sf_cache_timeout_sec) ! callout_reset(&sf_timeout, sf_tticks, sf_buf_timeout, (void *) 0); ! for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { vm_pindex_t pindex; vm_offset_t pgoff; pindex = OFF_TO_IDX(off); retry_lookup: xfsize = obj->un_pager.vnp.vnp_size - off; if (xfsize > PAGE_SIZE) xfsize = PAGE_SIZE; *************** *** 1636,1645 **** xfsize = uap->nbytes - sbytes; if (xfsize <= 0) break; - /* - * Optimize the non-blocking case by looking at the socket space - * before going to the extra work of constituting the sf_buf. - */ if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { if (so->so_state & SS_CANTSENDMORE) error = EPIPE; --- 1891,1896 ---- *************** *** 1647,1689 **** error = EAGAIN; sbunlock(&so->so_snd); goto done; ! } ! /* ! * Attempt to look up the page. ! * ! * Allocate if not found ! * ! * Wait and loop if busy. ! */ pg = vm_page_lookup(obj, pindex); ! if (pg == NULL) { ! pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); ! if (pg == NULL) { ! 
VM_WAIT; ! goto retry_lookup; ! } ! vm_page_wakeup(pg); } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { ! goto retry_lookup; } - /* - * Wire the page so it does not get ripped out from under - * us. - */ - vm_page_wire(pg); ! /* ! * If page is not valid for what we need, initiate I/O */ if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { struct uio auio; struct iovec aiov; int bsize; - /* * Ensure that our page is still around when the I/O * completes. --- 1898,1936 ---- error = EAGAIN; sbunlock(&so->so_snd); goto done; ! } ! ! /* modified part begin */ ! ! /* look up in the hot page cache */ ! pg = vm_page_lookup(obj, pindex); ! if (pg == NULL) { ! ! if (!(pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL))) { ! VM_WAIT; ! goto retry_lookup; ! } ! vm_page_wakeup(pg); } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { ! goto retry_lookup; ! } else if ((sf = sf_page_lookup(pg))) { ! goto begin_send; } vm_page_wire(pg); ! /* If page is not valid for what we need ! * initiate I/O if flag is 0 ! * return 999 if flag is SF_NONIO */ if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { + if (uap->flags == 0) { struct uio auio; struct iovec aiov; int bsize; /* * Ensure that our page is still around when the I/O * completes. *************** *** 1724,1736 **** sbunlock(&so->so_snd); goto done; } ! } ! ! /* ! * Get a sendfile buf. We usually wait as long as necessary, ! * but this wait can be interrupted. ! */ if ((sf = sf_buf_alloc()) == NULL) { s = splvm(); vm_page_unwire(pg, 0); --- 1971,1992 ---- sbunlock(&so->so_snd); goto done; } ! } ! else if (uap->flags == SF_NONIO) { ! vm_page_unwire(pg, 0); ! if (pg->wire_count == 0 && pg->valid == 0 && ! pg->busy == 0 && !(pg->flags & PG_BUSY) && ! pg->hold_count == 0) { ! vm_page_busy(pg); ! vm_page_free(pg); ! } ! sbunlock(&so->so_snd); ! error = 999; ! goto done; ! } ! } /* page is not valid */ ! 
/* Get a sendfile buf */ if ((sf = sf_buf_alloc()) == NULL) { s = splvm(); vm_page_unwire(pg, 0); *************** *** 1742,1788 **** goto done; } - - /* - * Allocate a kernel virtual page and insert the physical page - * into it. - */ - sf->m = pg; pmap_qenter(sf->kva, &pg, 1); ! /* ! * Get an mbuf header and set it up as having external storage. ! */ MGETHDR(m, M_WAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; - sf_buf_free((void *)sf->kva, PAGE_SIZE); sbunlock(&so->so_snd); goto done; } m->m_ext.ext_free = sf_buf_free; m->m_ext.ext_ref = sf_buf_ref; m->m_ext.ext_buf = (void *)sf->kva; m->m_ext.ext_size = PAGE_SIZE; m->m_data = (char *) sf->kva + pgoff; m->m_flags |= M_EXT; ! m->m_pkthdr.len = m->m_len = xfsize; ! /* ! * Add the buffer to the socket buffer chain. ! */ s = splnet(); retry_space: - /* - * Make sure that the socket is still able to take more data. - * CANTSENDMORE being true usually means that the connection - * was closed. so_error is true when an error was sensed after - * a previous send. - * The state is checked after the page mapping and buffer - * allocation above since those operations may block and make - * any socket checks stale. From this point forward, nothing - * blocks before the pru_send (or more accurately, any blocking - * results in a loop back to here to re-check). - */ if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; --- 1998,2070 ---- goto done; } sf->m = pg; pmap_qenter(sf->kva, &pg, 1); ! ! sf_cache_insert(pg, sf); ! ! /* get mbuf and chain into the socket. NOTE(review): if MGETHDR fails below, sf is not released on that path; the sf_buf_free call that used to be there is dropped by this patch. */ ! 
begin_send: MGETHDR(m, M_WAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sbunlock(&so->so_snd); goto done; } + /* only decrease ref count, but not free page */ + m->m_ext.ext_free = sf_buf_free; m->m_ext.ext_ref = sf_buf_ref; m->m_ext.ext_buf = (void *)sf->kva; m->m_ext.ext_size = PAGE_SIZE; m->m_data = (char *) sf->kva + pgoff; m->m_flags |= M_EXT; ! m->m_len = xfsize; ! ! 
/* assemble header into the first packet as mbuf chain */ ! if (uap->hdtr != NULL && ! hdtr.headers != NULL && ! !headSent) { ! int i; ! struct iovec *iov; ! ! MGETHDR(hm, M_WAIT, MT_DATA); ! if (hm == NULL) { ! error = ENOBUFS; ! m_freem(m); ! sbunlock(&so->so_snd); ! goto done; ! } ! ! /* get a cluster for header space. NOTE(review): the failure branch below frees m but does not free hm and does not assign error. */ ! MCLGET(hm, M_WAIT); ! if ((hm->m_flags & M_EXT) == 0) { ! m_freem(m); ! sbunlock(&so->so_snd); ! goto done; ! } ! ! /* NOTE(review): head_len is not bounded before the copyin calls below; the 2048 check after the loop only logs. The copyin result is overwritten on each iteration and not tested. The iovec is read through uap->hdtr, the pointer that was earlier passed to copyin as a source - confirm it may be dereferenced directly. */ ! for (i = 0; i < hdtr.hdr_cnt; i++) { ! iov = &(uap->hdtr->headers[i]); ! error = copyin(iov->iov_base, ! (char *)(hm->m_data + head_len), ! iov->iov_len); ! head_len += iov->iov_len; ! } ! ! if (head_len > 2048) ! log(LOG_INFO, "sendfile header length > 2048"); ! ! hm->m_pkthdr.len = head_len + xfsize; ! hm->m_len = head_len; ! hm->m_next = m; ! } ! else ! m->m_pkthdr.len = xfsize; ! s = splnet(); retry_space: if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; *************** *** 1790,1849 **** error = so->so_error; so->so_error = 0; } ! m_freem(m); sbunlock(&so->so_snd); splx(s); goto done; } ! /* ! * Wait for socket space to become available. We do this just ! * after checking the connection state above in order to avoid ! * a race condition with sbwait(). ! */ if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { if (so->so_state & SS_NBIO) { ! m_freem(m); sbunlock(&so->so_snd); splx(s); error = EAGAIN; goto done; } error = sbwait(&so->so_snd); - /* - * An error from sbwait usually indicates that we've - * been interrupted by a signal. 
If we've sent anything - * then return bytes sent, otherwise return the error. - */ if (error) { ! m_freem(m); sbunlock(&so->so_snd); splx(s); goto done; } goto retry_space; } ! error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p); splx(s); if (error) { sbunlock(&so->so_snd); goto done; } } sbunlock(&so->so_snd); - /* - * Send trailers. Wimp out and use writev(2). 
- */ if (uap->hdtr != NULL && hdtr.trailers != NULL) { nuap.fd = uap->s; nuap.iovp = hdtr.trailers; nuap.iovcnt = hdtr.trl_cnt; error = writev(p, &nuap); ! if (error) goto done; sbytes += p->p_retval[0]; } - done: if (uap->sbytes != NULL) { copyout(&sbytes, uap->sbytes, sizeof(off_t)); --- 2072,2149 ---- error = so->so_error; so->so_error = 0; } ! /* ! if (hm && !headSent) ! m_freem(hm); ! else ! */ ! m_freem(m); sbunlock(&so->so_snd); splx(s); goto done; } ! if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { if (so->so_state & SS_NBIO) { ! /* ! if (hm && !headSent) ! m_freem(hm); ! else ! */ ! m_freem(m); sbunlock(&so->so_snd); splx(s); error = EAGAIN; goto done; } error = sbwait(&so->so_snd); if (error) { ! /* ! if (hm && !headSent) ! m_freem(hm); ! else ! */ ! m_freem(m); sbunlock(&so->so_snd); splx(s); goto done; } goto retry_space; } ! ! if (uap->hdtr != NULL && ! hdtr.headers != NULL && ! !headSent) ! error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, hm, 0, 0, p); ! else ! error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p); ! splx(s); if (error) { sbunlock(&so->so_snd); goto done; } + + if (uap->hdtr != NULL && + hdtr.headers != NULL && + !headSent) { + sbytes += head_len; + headSent = 1; + } + } sbunlock(&so->so_snd); if (uap->hdtr != NULL && hdtr.trailers != NULL) { nuap.fd = uap->s; nuap.iovp = hdtr.trailers; nuap.iovcnt = hdtr.trl_cnt; error = writev(p, &nuap); ! 
if (error) { goto done; + } sbytes += p->p_retval[0]; } done: if (uap->sbytes != NULL) { copyout(&sbytes, uap->sbytes, sizeof(off_t)); *************** *** 1852,1856 **** --- 2152,2157 ---- vrele(vp); if (fp) fdrop(fp, p); + return (error); } Index: sys/socketvar.h =================================================================== RCS file: /home/ncvs/src/sys/sys/socketvar.h,v retrieving revision 1.46.2.8 diff -c -r1.46.2.8 socketvar.h *** sys/socketvar.h 1 May 2002 03:26:32 -0000 1.46.2.8 --- sys/socketvar.h 15 Aug 2003 19:38:09 -0000 *************** *** 274,280 **** --- 274,288 ---- int refcnt; /* reference count */ struct vm_page *m; /* currently mapped page */ vm_offset_t kva; /* va of mapping */ + struct sf_buf *sf_next; /* next in LRU */ + struct sf_buf *sf_prev; /* prev in LRU */ + struct sf_buf *sf_nextHash; /* next in hash entry */ + struct sf_buf **sf_prevHash; /* address of the pointer that links to this entry in its hash chain */ + int sf_isOnLRU; /* set to 1 when linked on the LRU list, 0 when removed */ }; + + /* sendfile flags */ + #define SF_NONIO 1 struct accept_filter { char accf_name[16]; Index: sys/sysctl.h =================================================================== RCS file: /home/ncvs/src/sys/sys/sysctl.h,v retrieving revision 1.81.2.8 diff -c -r1.81.2.8 sysctl.h *** sys/sysctl.h 17 Mar 2002 11:08:38 -0000 1.81.2.8 --- sys/sysctl.h 15 Aug 2003 19:38:10 -0000 *************** *** 331,337 **** #define KERN_PS_STRINGS 32 /* int: address of PS_STRINGS */ #define KERN_USRSTACK 33 /* int: address of USRSTACK */ #define KERN_LOGSIGEXIT 34 /* int: do we log sigexit procs? */ ! #define KERN_MAXID 35 /* number of valid kern ids */ #define CTL_KERN_NAMES { \ { 0, 0 }, \ --- 331,338 ---- #define KERN_PS_STRINGS 32 /* int: address of PS_STRINGS */ #define KERN_USRSTACK 33 /* int: address of USRSTACK */ #define KERN_LOGSIGEXIT 34 /* int: do we log sigexit procs? */ ! #define KERN_SENDFILE 35 /* sendfile statistics */ ! 
#define KERN_MAXID 36 /* number of valid kern ids */ #define CTL_KERN_NAMES { \ { 0, 0 }, \ *************** *** 369,374 **** --- 370,376 ---- { "ps_strings", CTLTYPE_INT }, \ { "usrstack", CTLTYPE_INT }, \ { "logsigexit", CTLTYPE_INT }, \ + { "sendfile", CTLTYPE_NODE }, \ } /* *************** *** 402,407 **** --- 404,418 ---- #define KIPC_MAX_DATALEN 7 /* int: max length of data? */ #define KIPC_MBSTAT 8 /* struct: mbuf usage statistics */ #define KIPC_NMBCLUSTERS 9 /* int: maximum mbuf clusters */ + + #define KSNDF_NSFBUFS 1 /* number of sendfile bufs */ + #define KSNDF_NSFPGS 2 /* number of sendfile cached pages */ + #define KSNDF_NLRUPGS 3 /* number of sendfile free pages */ + #define KSNDF_TOSEC 4 /* sendfile cache timeout in seconds */ + + extern int numsfpages; + extern int lrusfpages; + extern int sf_cache_timeout_sec; /* * CTL_HW identifiers