xref: /illumos-gate/usr/src/uts/common/fs/swapfs/swap_vnops.c (revision b6805bf78d2bbbeeaea8909a05623587b42d58b3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/param.h>
27 #include <sys/systm.h>
28 #include <sys/buf.h>
29 #include <sys/cred.h>
30 #include <sys/errno.h>
31 #include <sys/vnode.h>
32 #include <sys/vfs_opreg.h>
33 #include <sys/cmn_err.h>
34 #include <sys/swap.h>
35 #include <sys/mman.h>
36 #include <sys/vmsystm.h>
37 #include <sys/vtrace.h>
38 #include <sys/debug.h>
39 #include <sys/sysmacros.h>
40 #include <sys/vm.h>
41 
42 #include <sys/fs/swapnode.h>
43 
44 #include <vm/seg.h>
45 #include <vm/page.h>
46 #include <vm/pvn.h>
47 #include <fs/fs_subr.h>
48 
49 #include <vm/seg_kp.h>
50 
51 /*
52  * Define the routines within this file.
53  */
54 static int	swap_getpage(struct vnode *vp, offset_t off, size_t len,
55     uint_t *protp, struct page **plarr, size_t plsz, struct seg *seg,
56     caddr_t addr, enum seg_rw rw, struct cred *cr, caller_context_t *ct);
57 static int	swap_putpage(struct vnode *vp, offset_t off, size_t len,
58     int flags, struct cred *cr, caller_context_t *ct);
59 static void	swap_inactive(struct vnode *vp, struct cred *cr,
60     caller_context_t *ct);
61 static void	swap_dispose(vnode_t *vp, page_t *pp, int fl, int dn,
62     cred_t *cr, caller_context_t *ct);
63 
64 static int	swap_getapage(struct vnode *vp, u_offset_t off, size_t len,
65     uint_t *protp, page_t **plarr, size_t plsz,
66     struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr);
67 
68 int	swap_getconpage(struct vnode *vp, u_offset_t off, size_t len,
69     uint_t *protp, page_t **plarr, size_t plsz, page_t *conpp,
70     uint_t *pszc, spgcnt_t *nreloc, struct seg *seg, caddr_t addr,
71     enum seg_rw rw, struct cred *cr);
72 
73 static int 	swap_putapage(struct vnode *vp, page_t *pp, u_offset_t *off,
74     size_t *lenp, int flags, struct cred *cr);
75 
76 const fs_operation_def_t swap_vnodeops_template[] = {
77 	VOPNAME_INACTIVE,	{ .vop_inactive = swap_inactive },
78 	VOPNAME_GETPAGE,	{ .vop_getpage = swap_getpage },
79 	VOPNAME_PUTPAGE,	{ .vop_putpage = swap_putpage },
80 	VOPNAME_DISPOSE,	{ .vop_dispose = swap_dispose },
81 	VOPNAME_SETFL,		{ .error = fs_error },
82 	VOPNAME_POLL,		{ .error = fs_error },
83 	VOPNAME_PATHCONF,	{ .error = fs_error },
84 	VOPNAME_GETSECATTR,	{ .error = fs_error },
85 	VOPNAME_SHRLOCK,	{ .error = fs_error },
86 	NULL,			NULL
87 };
88 
89 vnodeops_t *swap_vnodeops;
90 
91 /* ARGSUSED */
92 static void
93 swap_inactive(
94 	struct vnode *vp,
95 	struct cred *cr,
96 	caller_context_t *ct)
97 {
98 	SWAPFS_PRINT(SWAP_VOPS, "swap_inactive: vp %x\n", vp, 0, 0, 0, 0);
99 }
100 
101 /*
102  * Return all the pages from [off..off+len] in given file
103  */
104 /*ARGSUSED*/
105 static int
106 swap_getpage(
107 	struct vnode *vp,
108 	offset_t off,
109 	size_t len,
110 	uint_t *protp,
111 	page_t *pl[],
112 	size_t plsz,
113 	struct seg *seg,
114 	caddr_t addr,
115 	enum seg_rw rw,
116 	struct cred *cr,
117 	caller_context_t *ct)
118 {
119 	int err;
120 
121 	SWAPFS_PRINT(SWAP_VOPS, "swap_getpage: vp %p, off %llx, len %lx\n",
122 	    (void *)vp, off, len, 0, 0);
123 
124 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETPAGE,
125 	    "swapfs getpage:vp %p off %llx len %ld",
126 	    (void *)vp, off, len);
127 
128 	if (len <= PAGESIZE) {
129 		err = swap_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
130 		    seg, addr, rw, cr);
131 	} else {
132 		err = pvn_getpages(swap_getapage, vp, (u_offset_t)off, len,
133 		    protp, pl, plsz, seg, addr, rw, cr);
134 	}
135 
136 	return (err);
137 }
138 
139 /*
140  * Called from pvn_getpages or swap_getpage to get a particular page.
141  */
142 /*ARGSUSED*/
143 static int
144 swap_getapage(
145 	struct vnode *vp,
146 	u_offset_t off,
147 	size_t len,
148 	uint_t *protp,
149 	page_t *pl[],
150 	size_t plsz,
151 	struct seg *seg,
152 	caddr_t addr,
153 	enum seg_rw rw,
154 	struct cred *cr)
155 {
156 	struct page *pp, *rpp;
157 	int flags;
158 	int err = 0;
159 	struct vnode *pvp = NULL;
160 	u_offset_t poff;
161 	int flag_noreloc;
162 	se_t lock;
163 	extern int kcage_on;
164 	int upgrade = 0;
165 
166 	SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n",
167 	    vp, off, len, 0, 0);
168 
169 	/*
170 	 * Until there is a call-back mechanism to cause SEGKP
171 	 * pages to be unlocked, make them non-relocatable.
172 	 */
173 	if (SEG_IS_SEGKP(seg))
174 		flag_noreloc = PG_NORELOC;
175 	else
176 		flag_noreloc = 0;
177 
178 	if (protp != NULL)
179 		*protp = PROT_ALL;
180 
181 	lock = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
182 
183 again:
184 	if (pp = page_lookup(vp, off, lock)) {
185 		/*
186 		 * In very rare instances, a segkp page may have been
187 		 * relocated outside of the kernel by the kernel cage
188 		 * due to the window between page_unlock() and
189 		 * VOP_PUTPAGE() in segkp_unlock().  Due to the
190 		 * rareness of these occurances, the solution is to
191 		 * relocate the page to a P_NORELOC page.
192 		 */
193 		if (flag_noreloc != 0) {
194 			if (!PP_ISNORELOC(pp) && kcage_on) {
195 				if (lock != SE_EXCL) {
196 					upgrade = 1;
197 					if (!page_tryupgrade(pp)) {
198 						page_unlock(pp);
199 						lock = SE_EXCL;
200 						goto again;
201 					}
202 				}
203 
204 				if (page_relocate_cage(&pp, &rpp) != 0)
205 					panic("swap_getapage: "
206 					    "page_relocate_cage failed");
207 
208 				pp = rpp;
209 			}
210 		}
211 
212 		if (pl) {
213 			if (upgrade)
214 				page_downgrade(pp);
215 
216 			pl[0] = pp;
217 			pl[1] = NULL;
218 		} else {
219 			page_unlock(pp);
220 		}
221 	} else {
222 		pp = page_create_va(vp, off, PAGESIZE,
223 		    PG_WAIT | PG_EXCL | flag_noreloc,
224 		    seg, addr);
225 		/*
226 		 * Someone raced in and created the page after we did the
227 		 * lookup but before we did the create, so go back and
228 		 * try to look it up again.
229 		 */
230 		if (pp == NULL)
231 			goto again;
232 		if (rw != S_CREATE) {
233 			err = swap_getphysname(vp, off, &pvp, &poff);
234 			if (pvp) {
235 				struct anon *ap;
236 				kmutex_t *ahm;
237 
238 				flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
239 				err = VOP_PAGEIO(pvp, pp, poff,
240 				    PAGESIZE, flags, cr, NULL);
241 
242 				if (!err) {
243 					ahm = AH_MUTEX(vp, off);
244 					mutex_enter(ahm);
245 
246 					ap = swap_anon(vp, off);
247 					if (ap == NULL) {
248 						panic("swap_getapage:"
249 						    " null anon");
250 					}
251 
252 					if (ap->an_pvp == pvp &&
253 					    ap->an_poff == poff) {
254 						swap_phys_free(pvp, poff,
255 						    PAGESIZE);
256 						ap->an_pvp = NULL;
257 						ap->an_poff = NULL;
258 						hat_setmod(pp);
259 					}
260 
261 					mutex_exit(ahm);
262 				}
263 			} else {
264 				if (!err)
265 					pagezero(pp, 0, PAGESIZE);
266 
267 				/*
268 				 * If it's a fault ahead, release page_io_lock
269 				 * and SE_EXCL we grabbed in page_create_va
270 				 *
271 				 * If we are here, we haven't called VOP_PAGEIO
272 				 * and thus calling pvn_read_done(pp, B_READ)
273 				 * below may mislead that we tried i/o. Besides,
274 				 * in case of async, pvn_read_done() should
275 				 * not be called by *getpage()
276 				 */
277 				if (pl == NULL) {
278 					/*
279 					 * swap_getphysname can return error
280 					 * only when we are getting called from
281 					 * swapslot_free which passes non-NULL
282 					 * pl to VOP_GETPAGE.
283 					 */
284 					ASSERT(err == 0);
285 					page_io_unlock(pp);
286 					page_unlock(pp);
287 				}
288 			}
289 		}
290 
291 		ASSERT(pp != NULL);
292 
293 		if (err && pl)
294 			pvn_read_done(pp, B_ERROR);
295 
296 		if (!err && pl)
297 			pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
298 	}
299 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
300 	    "swapfs getapage:pp %p vp %p off %llx", pp, vp, off);
301 	return (err);
302 }
303 
304 /*
305  * Called from large page anon routines only! This is an ugly hack where
306  * the anon layer directly calls into swapfs with a preallocated large page.
307  * Another method would have been to change to VOP and add an extra arg for
308  * the preallocated large page. This all could be cleaned up later when we
309  * solve the anonymous naming problem and no longer need to loop across of
310  * the VOP in PAGESIZE increments to fill in or initialize a large page as
311  * is done today. I think the latter is better since it avoid a change to
312  * the VOP interface that could later be avoided.
313  */
314 int
315 swap_getconpage(
316 	struct vnode *vp,
317 	u_offset_t off,
318 	size_t len,
319 	uint_t *protp,
320 	page_t *pl[],
321 	size_t plsz,
322 	page_t	*conpp,
323 	uint_t	*pszc,
324 	spgcnt_t *nreloc,
325 	struct seg *seg,
326 	caddr_t addr,
327 	enum seg_rw rw,
328 	struct cred *cr)
329 {
330 	struct page	*pp;
331 	int 		err = 0;
332 	struct vnode	*pvp = NULL;
333 	u_offset_t	poff;
334 
335 	ASSERT(len == PAGESIZE);
336 	ASSERT(pl != NULL);
337 	ASSERT(plsz == PAGESIZE);
338 	ASSERT(protp == NULL);
339 	ASSERT(nreloc != NULL);
340 	ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */
341 	SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n",
342 	    vp, off, len, 0, 0);
343 
344 	/*
345 	 * If we are not using a preallocated page then we know one already
346 	 * exists. So just let the old code handle it.
347 	 */
348 	if (conpp == NULL) {
349 		err = swap_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
350 		    seg, addr, rw, cr);
351 		return (err);
352 	}
353 	ASSERT(conpp->p_szc != 0);
354 	ASSERT(PAGE_EXCL(conpp));
355 
356 
357 	ASSERT(conpp->p_next == conpp);
358 	ASSERT(conpp->p_prev == conpp);
359 	ASSERT(!PP_ISAGED(conpp));
360 	ASSERT(!PP_ISFREE(conpp));
361 
362 	*nreloc = 0;
363 	pp = page_lookup_create(vp, off, SE_SHARED, conpp, nreloc, 0);
364 
365 	/*
366 	 * If existing page is found we may need to relocate.
367 	 */
368 	if (pp != conpp) {
369 		ASSERT(rw != S_CREATE);
370 		ASSERT(pszc != NULL);
371 		ASSERT(PAGE_SHARED(pp));
372 		if (pp->p_szc < conpp->p_szc) {
373 			*pszc = pp->p_szc;
374 			page_unlock(pp);
375 			err = -1;
376 		} else if (pp->p_szc > conpp->p_szc &&
377 		    seg->s_szc > conpp->p_szc) {
378 			*pszc = MIN(pp->p_szc, seg->s_szc);
379 			page_unlock(pp);
380 			err = -2;
381 		} else {
382 			pl[0] = pp;
383 			pl[1] = NULL;
384 			if (page_pptonum(pp) &
385 			    (page_get_pagecnt(conpp->p_szc) - 1))
386 				cmn_err(CE_PANIC, "swap_getconpage: no root");
387 		}
388 		return (err);
389 	}
390 
391 	ASSERT(PAGE_EXCL(pp));
392 
393 	if (*nreloc != 0) {
394 		ASSERT(rw != S_CREATE);
395 		pl[0] = pp;
396 		pl[1] = NULL;
397 		return (0);
398 	}
399 
400 	*nreloc = 1;
401 
402 	/*
403 	 * If necessary do the page io.
404 	 */
405 	if (rw != S_CREATE) {
406 		/*
407 		 * Since we are only called now on behalf of an
408 		 * address space operation it's impossible for
409 		 * us to fail unlike swap_getapge() which
410 		 * also gets called from swapslot_free().
411 		 */
412 		if (swap_getphysname(vp, off, &pvp, &poff)) {
413 			cmn_err(CE_PANIC,
414 			    "swap_getconpage: swap_getphysname failed!");
415 		}
416 
417 		if (pvp != NULL) {
418 			err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ,
419 			    cr, NULL);
420 			if (err == 0) {
421 				struct anon *ap;
422 				kmutex_t *ahm;
423 
424 				ahm = AH_MUTEX(vp, off);
425 				mutex_enter(ahm);
426 				ap = swap_anon(vp, off);
427 				if (ap == NULL)
428 					panic("swap_getconpage: null anon");
429 				if (ap->an_pvp != pvp || ap->an_poff != poff)
430 					panic("swap_getconpage: bad anon");
431 
432 				swap_phys_free(pvp, poff, PAGESIZE);
433 				ap->an_pvp = NULL;
434 				ap->an_poff = NULL;
435 				hat_setmod(pp);
436 				mutex_exit(ahm);
437 			}
438 		} else {
439 			pagezero(pp, 0, PAGESIZE);
440 		}
441 	}
442 
443 	/*
444 	 * Normally we would let pvn_read_done() destroy
445 	 * the page on IO error. But since this is a preallocated
446 	 * page we'll let the anon layer handle it.
447 	 */
448 	page_io_unlock(pp);
449 	if (err != 0)
450 		page_hashout(pp, NULL);
451 	ASSERT(pp->p_next == pp);
452 	ASSERT(pp->p_prev == pp);
453 
454 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
455 	    "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off);
456 
457 	pl[0] = pp;
458 	pl[1] = NULL;
459 	return (err);
460 }
461 
462 /* Async putpage klustering stuff */
463 int sw_pending_size;
464 extern int klustsize;
465 extern struct async_reqs *sw_getreq();
466 extern void sw_putreq(struct async_reqs *);
467 extern void sw_putbackreq(struct async_reqs *);
468 extern struct async_reqs *sw_getfree();
469 extern void sw_putfree(struct async_reqs *);
470 
471 static size_t swap_putpagecnt, swap_pagespushed;
472 static size_t swap_otherfail, swap_otherpages;
473 static size_t swap_klustfail, swap_klustpages;
474 static size_t swap_getiofail, swap_getiopages;
475 
476 /*
477  * Flags are composed of {B_INVAL, B_DIRTY B_FREE, B_DONTNEED}.
478  * If len == 0, do from off to EOF.
479  */
480 static int swap_nopage = 0;	/* Don't do swap_putpage's if set */
481 
482 /* ARGSUSED */
483 static int
484 swap_putpage(
485 	struct vnode *vp,
486 	offset_t off,
487 	size_t len,
488 	int flags,
489 	struct cred *cr,
490 	caller_context_t *ct)
491 {
492 	page_t *pp;
493 	u_offset_t io_off;
494 	size_t io_len = 0;
495 	int err = 0;
496 	int nowait;
497 	struct async_reqs *arg;
498 
499 	if (swap_nopage)
500 		return (0);
501 
502 	ASSERT(vp->v_count != 0);
503 
504 	nowait = flags & B_PAGE_NOWAIT;
505 
506 	/*
507 	 * Clear force flag so that p_lckcnt pages are not invalidated.
508 	 */
509 	flags &= ~(B_FORCE | B_PAGE_NOWAIT);
510 
511 	SWAPFS_PRINT(SWAP_VOPS,
512 	    "swap_putpage: vp %p, off %llx len %lx, flags %x\n",
513 	    (void *)vp, off, len, flags, 0);
514 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_PUTPAGE,
515 	    "swapfs putpage:vp %p off %llx len %ld", (void *)vp, off, len);
516 
517 	if (vp->v_flag & VNOMAP)
518 		return (ENOSYS);
519 
520 	if (!vn_has_cached_data(vp))
521 		return (0);
522 
523 	if (len == 0) {
524 		if (curproc == proc_pageout)
525 			cmn_err(CE_PANIC, "swapfs: pageout can't block");
526 
527 		/* Search the entire vp list for pages >= off. */
528 		err = pvn_vplist_dirty(vp, (u_offset_t)off, swap_putapage,
529 		    flags, cr);
530 	} else {
531 		u_offset_t eoff;
532 
533 		/*
534 		 * Loop over all offsets in the range [off...off + len]
535 		 * looking for pages to deal with.
536 		 */
537 		eoff = off + len;
538 		for (io_off = (u_offset_t)off; io_off < eoff;
539 		    io_off += io_len) {
540 			/*
541 			 * If we run out of the async req slot, put the page
542 			 * now instead of queuing.
543 			 */
544 			if (flags == (B_ASYNC | B_FREE) &&
545 			    sw_pending_size < klustsize &&
546 			    (arg = sw_getfree())) {
547 				/*
548 				 * If we are clustering, we should allow
549 				 * pageout to feed us more pages because # of
550 				 * pushes is limited by # of I/Os, and one
551 				 * cluster is considered to be one I/O.
552 				 */
553 				if (pushes)
554 					pushes--;
555 
556 				arg->a_vp = vp;
557 				arg->a_off = io_off;
558 				arg->a_len = PAGESIZE;
559 				arg->a_flags = B_ASYNC | B_FREE;
560 				arg->a_cred = kcred;
561 				sw_putreq(arg);
562 				io_len = PAGESIZE;
563 				continue;
564 			}
565 			/*
566 			 * If we are not invalidating pages, use the
567 			 * routine page_lookup_nowait() to prevent
568 			 * reclaiming them from the free list.
569 			 */
570 			if (!nowait && ((flags & B_INVAL) ||
571 			    (flags & (B_ASYNC | B_FREE)) == B_FREE))
572 				pp = page_lookup(vp, io_off, SE_EXCL);
573 			else
574 				pp = page_lookup_nowait(vp, io_off,
575 				    (flags & (B_FREE | B_INVAL)) ?
576 				    SE_EXCL : SE_SHARED);
577 
578 			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
579 				io_len = PAGESIZE;
580 			else {
581 				err = swap_putapage(vp, pp, &io_off, &io_len,
582 				    flags, cr);
583 				if (err != 0)
584 					break;
585 			}
586 		}
587 	}
588 	/* If invalidating, verify all pages on vnode list are gone. */
589 	if (err == 0 && off == 0 && len == 0 &&
590 	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
591 		cmn_err(CE_WARN,
592 		    "swap_putpage: B_INVAL, pages not gone");
593 	}
594 	return (err);
595 }
596 
597 /*
598  * Write out a single page.
599  * For swapfs this means choose a physical swap slot and write the page
600  * out using VOP_PAGEIO.
601  * In the (B_ASYNC | B_FREE) case we try to find a bunch of other dirty
602  * swapfs pages, a bunch of contiguous swap slots and then write them
603  * all out in one clustered i/o.
604  */
605 /*ARGSUSED*/
606 static int
607 swap_putapage(
608 	struct vnode *vp,
609 	page_t *pp,
610 	u_offset_t *offp,
611 	size_t *lenp,
612 	int flags,
613 	struct cred *cr)
614 {
615 	int err;
616 	struct vnode *pvp;
617 	u_offset_t poff, off;
618 	u_offset_t doff;
619 	size_t dlen;
620 	size_t klsz = 0;
621 	u_offset_t klstart = 0;
622 	struct vnode *klvp = NULL;
623 	page_t *pplist;
624 	se_t se;
625 	struct async_reqs *arg;
626 	size_t swap_klustsize;
627 
628 	/*
629 	 * This check is added for callers who access swap_putpage with len = 0.
630 	 * swap_putpage calls swap_putapage page-by-page via pvn_vplist_dirty.
631 	 * And it's necessary to do the same queuing if users have the same
632 	 * B_ASYNC|B_FREE flags on.
633 	 */
634 	if (flags == (B_ASYNC | B_FREE) &&
635 	    sw_pending_size < klustsize && (arg = sw_getfree())) {
636 
637 		hat_setmod(pp);
638 		page_io_unlock(pp);
639 		page_unlock(pp);
640 
641 		arg->a_vp = vp;
642 		arg->a_off = pp->p_offset;
643 		arg->a_len = PAGESIZE;
644 		arg->a_flags = B_ASYNC | B_FREE;
645 		arg->a_cred = kcred;
646 		sw_putreq(arg);
647 
648 		return (0);
649 	}
650 
651 	SWAPFS_PRINT(SWAP_PUTP,
652 	    "swap_putapage: pp %p, vp %p, off %llx, flags %x\n",
653 	    pp, vp, pp->p_offset, flags, 0);
654 
655 	ASSERT(PAGE_LOCKED(pp));
656 
657 	off = pp->p_offset;
658 
659 	doff = off;
660 	dlen = PAGESIZE;
661 
662 	if (err = swap_newphysname(vp, off, &doff, &dlen, &pvp, &poff)) {
663 		err = (flags == (B_ASYNC | B_FREE) ? ENOMEM : 0);
664 		hat_setmod(pp);
665 		page_io_unlock(pp);
666 		page_unlock(pp);
667 		goto out;
668 	}
669 
670 	klvp = pvp;
671 	klstart = poff;
672 	pplist = pp;
673 	/*
674 	 * If this is ASYNC | FREE and we've accumulated a bunch of such
675 	 * pending requests, kluster.
676 	 */
677 	if (flags == (B_ASYNC | B_FREE))
678 		swap_klustsize = klustsize;
679 	else
680 		swap_klustsize = PAGESIZE;
681 	se = (flags & B_FREE ? SE_EXCL : SE_SHARED);
682 	klsz = PAGESIZE;
683 	while (klsz < swap_klustsize) {
684 		if ((arg = sw_getreq()) == NULL) {
685 			swap_getiofail++;
686 			swap_getiopages += btop(klsz);
687 			break;
688 		}
689 		ASSERT(vn_matchops(arg->a_vp, swap_vnodeops));
690 		vp = arg->a_vp;
691 		off = arg->a_off;
692 
693 		if ((pp = page_lookup_nowait(vp, off, se)) == NULL) {
694 			swap_otherfail++;
695 			swap_otherpages += btop(klsz);
696 			sw_putfree(arg);
697 			break;
698 		}
699 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0) {
700 			sw_putfree(arg);
701 			continue;
702 		}
703 		/* Get new physical backing store for the page */
704 		doff = off;
705 		dlen = PAGESIZE;
706 		if (err = swap_newphysname(vp, off, &doff, &dlen,
707 		    &pvp, &poff)) {
708 			swap_otherfail++;
709 			swap_otherpages += btop(klsz);
710 			hat_setmod(pp);
711 			page_io_unlock(pp);
712 			page_unlock(pp);
713 			sw_putbackreq(arg);
714 			break;
715 		}
716 		/* Try to cluster new physical name with previous ones */
717 		if (klvp == pvp && poff == klstart + klsz) {
718 			klsz += PAGESIZE;
719 			page_add(&pplist, pp);
720 			pplist = pplist->p_next;
721 			sw_putfree(arg);
722 		} else if (klvp == pvp && poff == klstart - PAGESIZE) {
723 			klsz += PAGESIZE;
724 			klstart -= PAGESIZE;
725 			page_add(&pplist, pp);
726 			sw_putfree(arg);
727 		} else {
728 			swap_klustfail++;
729 			swap_klustpages += btop(klsz);
730 			hat_setmod(pp);
731 			page_io_unlock(pp);
732 			page_unlock(pp);
733 			sw_putbackreq(arg);
734 			break;
735 		}
736 	}
737 
738 	err = VOP_PAGEIO(klvp, pplist, klstart, klsz,
739 	    B_WRITE | flags, cr, NULL);
740 
741 	if ((flags & B_ASYNC) == 0)
742 		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
743 
744 	/* Statistics */
745 	if (!err) {
746 		swap_putpagecnt++;
747 		swap_pagespushed += btop(klsz);
748 	}
749 out:
750 	TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE,
751 	    "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx",
752 	    vp, klvp, klstart, klsz);
753 	if (err && err != ENOMEM)
754 		cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err);
755 	if (lenp)
756 		*lenp = PAGESIZE;
757 	return (err);
758 }
759 
760 static void
761 swap_dispose(
762 	vnode_t *vp,
763 	page_t *pp,
764 	int fl,
765 	int dn,
766 	cred_t *cr,
767 	caller_context_t *ct)
768 {
769 	int err;
770 	u_offset_t off = pp->p_offset;
771 	vnode_t *pvp;
772 	u_offset_t poff;
773 
774 	ASSERT(PAGE_EXCL(pp));
775 
776 	/*
777 	 * The caller will free/invalidate large page in one shot instead of
778 	 * one small page at a time.
779 	 */
780 	if (pp->p_szc != 0) {
781 		page_unlock(pp);
782 		return;
783 	}
784 
785 	err = swap_getphysname(vp, off, &pvp, &poff);
786 	if (!err && pvp != NULL)
787 		VOP_DISPOSE(pvp, pp, fl, dn, cr, ct);
788 	else
789 		fs_dispose(vp, pp, fl, dn, cr, ct);
790 }
791