xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_dispatch.c (revision c94be9439c4f0773ef60e2cec21d548359cfea20)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2018 Nexenta Systems, Inc.
29  */
30 
31 #include <sys/systm.h>
32 #include <sys/sdt.h>
33 #include <rpc/types.h>
34 #include <rpc/auth.h>
35 #include <rpc/auth_unix.h>
36 #include <rpc/auth_des.h>
37 #include <rpc/svc.h>
38 #include <rpc/xdr.h>
39 #include <nfs/nfs4.h>
40 #include <nfs/nfs_dispatch.h>
41 #include <nfs/nfs4_drc.h>
42 
/*
 * Highest NFSv4 minor version accepted by this file:
 * rfs4_minorvers_mismatch() rejects any COMPOUND whose minorversion
 * is greater than this value.
 */
#define	NFS4_MAX_MINOR_VERSION	0

/*
 * The default size of the duplicate request cache.
 *
 * NOTE(review): presumably this is handed to rfs4_init_drc() as
 * drc_size, i.e. a count of cache entries -- the caller is not in
 * this file, confirm there.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into.  Do not change this on the fly.
 *
 * NOTE(review): presumably passed to rfs4_init_drc() as drc_hash_size;
 * confirm against the caller.
 */
uint32_t nfs4_drc_hash = 541;

/* Replies to a COMPOUND with a resource error; defined later in this file. */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);
57 
58 /*
59  * Initialize a duplicate request cache.
60  */
61 rfs4_drc_t *
62 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
63 {
64 	rfs4_drc_t *drc;
65 	uint32_t   bki;
66 
67 	ASSERT(drc_size);
68 	ASSERT(drc_hash_size);
69 
70 	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
71 
72 	drc->max_size = drc_size;
73 	drc->in_use = 0;
74 
75 	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
76 
77 	drc->dr_hash = drc_hash_size;
78 
79 	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
80 
81 	for (bki = 0; bki < drc_hash_size; bki++) {
82 		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
83 		    offsetof(rfs4_dupreq_t, dr_bkt_next));
84 	}
85 
86 	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
87 	    offsetof(rfs4_dupreq_t, dr_next));
88 
89 	return (drc);
90 }
91 
92 /*
93  * Destroy a duplicate request cache.
94  */
95 void
96 rfs4_fini_drc(void)
97 {
98 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
99 	rfs4_drc_t *drc = nsrv4->nfs4_drc;
100 	rfs4_dupreq_t *drp, *drp_next;
101 
102 	/* iterate over the dr_cache and free the enties */
103 	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
104 
105 		if (drp->dr_state == NFS4_DUP_REPLAY)
106 			rfs4_compound_free(&(drp->dr_res));
107 
108 		if (drp->dr_addr.buf != NULL)
109 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
110 
111 		drp_next = list_next(&(drc->dr_cache), drp);
112 
113 		kmem_free(drp, sizeof (rfs4_dupreq_t));
114 	}
115 
116 	mutex_destroy(&drc->lock);
117 	kmem_free(drc->dr_buckets,
118 	    sizeof (list_t)*drc->dr_hash);
119 	kmem_free(drc, sizeof (rfs4_drc_t));
120 }
121 
122 /*
123  * rfs4_dr_chstate:
124  *
125  * Change the state of a rfs4_dupreq. If it's not in transition
126  * to the FREE state, return. If we are moving to the FREE state
127  * then we need to clean up the compound results and move the entry
128  * to the end of the list.
129  */
130 void
131 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
132 {
133 	rfs4_drc_t *drc;
134 
135 	ASSERT(drp);
136 	ASSERT(drp->drc);
137 	ASSERT(drp->dr_bkt);
138 	ASSERT(MUTEX_HELD(&drp->drc->lock));
139 
140 	drp->dr_state = new_state;
141 
142 	if (new_state != NFS4_DUP_FREE)
143 		return;
144 
145 	drc = drp->drc;
146 
147 	/*
148 	 * Remove entry from the bucket and
149 	 * dr_cache list, free compound results.
150 	 */
151 	list_remove(drp->dr_bkt, drp);
152 	list_remove(&(drc->dr_cache), drp);
153 	rfs4_compound_free(&(drp->dr_res));
154 }
155 
156 /*
157  * rfs4_alloc_dr:
158  *
159  * Malloc a new one if we have not reached our maximum cache
160  * limit, otherwise pick an entry off the tail -- Use if it
161  * is marked as NFS4_DUP_FREE, or is an entry in the
162  * NFS4_DUP_REPLAY state.
163  */
164 rfs4_dupreq_t *
165 rfs4_alloc_dr(rfs4_drc_t *drc)
166 {
167 	rfs4_dupreq_t *drp_tail, *drp = NULL;
168 
169 	ASSERT(drc);
170 	ASSERT(MUTEX_HELD(&drc->lock));
171 
172 	/*
173 	 * Have we hit the cache limit yet ?
174 	 */
175 	if (drc->in_use < drc->max_size) {
176 		/*
177 		 * nope, so let's malloc a new one
178 		 */
179 		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
180 		drp->drc = drc;
181 		drc->in_use++;
182 		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
183 		return (drp);
184 	}
185 
186 	/*
187 	 * Cache is all allocated now traverse the list
188 	 * backwards to find one we can reuse.
189 	 */
190 	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
191 	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
192 
193 		switch (drp_tail->dr_state) {
194 
195 		case NFS4_DUP_FREE:
196 			list_remove(&(drc->dr_cache), drp_tail);
197 			DTRACE_PROBE1(nfss__i__drc_freeclaim,
198 			    rfs4_dupreq_t *, drp_tail);
199 			return (drp_tail);
200 			/* NOTREACHED */
201 
202 		case NFS4_DUP_REPLAY:
203 			/* grab it. */
204 			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
205 			DTRACE_PROBE1(nfss__i__drc_replayclaim,
206 			    rfs4_dupreq_t *, drp_tail);
207 			return (drp_tail);
208 			/* NOTREACHED */
209 		}
210 	}
211 	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
212 	return (NULL);
213 }
214 
215 /*
216  * rfs4_find_dr:
217  *
218  * Search for an entry in the duplicate request cache by
219  * calculating the hash index based on the XID, and examining
220  * the entries in the hash bucket. If we find a match, return.
221  * Once we have searched the bucket we call rfs4_alloc_dr() to
222  * allocate a new entry, or reuse one that is available.
223  */
224 int
225 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
226 {
227 
228 	uint32_t	the_xid;
229 	list_t		*dr_bkt;
230 	rfs4_dupreq_t	*drp;
231 	int		bktdex;
232 
233 	/*
234 	 * Get the XID, calculate the bucket and search to
235 	 * see if we need to replay from the cache.
236 	 */
237 	the_xid = req->rq_xprt->xp_xid;
238 	bktdex = the_xid % drc->dr_hash;
239 
240 	dr_bkt = (list_t *)
241 	    &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
242 
243 	DTRACE_PROBE3(nfss__i__drc_bktdex,
244 	    int, bktdex,
245 	    uint32_t, the_xid,
246 	    list_t *, dr_bkt);
247 
248 	*dup = NULL;
249 
250 	mutex_enter(&drc->lock);
251 	/*
252 	 * Search the bucket for a matching xid and address.
253 	 */
254 	for (drp = list_head(dr_bkt); drp != NULL;
255 	    drp = list_next(dr_bkt, drp)) {
256 
257 		if (drp->dr_xid == the_xid &&
258 		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
259 		    bcmp((caddr_t)drp->dr_addr.buf,
260 		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
261 		    drp->dr_addr.len) == 0) {
262 
263 			/*
264 			 * Found a match so REPLAY the Reply
265 			 */
266 			if (drp->dr_state == NFS4_DUP_REPLAY) {
267 				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
268 				mutex_exit(&drc->lock);
269 				*dup = drp;
270 				DTRACE_PROBE1(nfss__i__drc_replay,
271 				    rfs4_dupreq_t *, drp);
272 				return (NFS4_DUP_REPLAY);
273 			}
274 
275 			/*
276 			 * This entry must be in transition, so return
277 			 * the 'pending' status.
278 			 */
279 			mutex_exit(&drc->lock);
280 			return (NFS4_DUP_PENDING);
281 		}
282 	}
283 
284 	drp = rfs4_alloc_dr(drc);
285 	mutex_exit(&drc->lock);
286 
287 	/*
288 	 * The DRC is full and all entries are in use. Upper function
289 	 * should error out this request and force the client to
290 	 * retransmit -- effectively this is a resource issue. NFSD
291 	 * threads tied up with native File System, or the cache size
292 	 * is too small for the server load.
293 	 */
294 	if (drp == NULL)
295 		return (NFS4_DUP_ERROR);
296 
297 	/*
298 	 * Init the state to NEW.
299 	 */
300 	drp->dr_state = NFS4_DUP_NEW;
301 
302 	/*
303 	 * If needed, resize the address buffer
304 	 */
305 	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
306 		if (drp->dr_addr.buf != NULL)
307 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
308 		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
309 		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
310 		if (drp->dr_addr.buf == NULL) {
311 			/*
312 			 * If the malloc fails, mark the entry
313 			 * as free and put on the tail.
314 			 */
315 			drp->dr_addr.maxlen = 0;
316 			drp->dr_state = NFS4_DUP_FREE;
317 			mutex_enter(&drc->lock);
318 			list_insert_tail(&(drc->dr_cache), drp);
319 			mutex_exit(&drc->lock);
320 			return (NFS4_DUP_ERROR);
321 		}
322 	}
323 
324 
325 	/*
326 	 * Copy the address.
327 	 */
328 	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
329 
330 	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
331 	    (caddr_t)drp->dr_addr.buf,
332 	    drp->dr_addr.len);
333 
334 	drp->dr_xid = the_xid;
335 	drp->dr_bkt = dr_bkt;
336 
337 	/*
338 	 * Insert at the head of the bucket and
339 	 * the drc lists..
340 	 */
341 	mutex_enter(&drc->lock);
342 	list_insert_head(&drc->dr_cache, drp);
343 	list_insert_head(dr_bkt, drp);
344 	mutex_exit(&drc->lock);
345 
346 	*dup = drp;
347 
348 	return (NFS4_DUP_NEW);
349 }
350 
351 /*
352  *
353  * This function handles the duplicate request cache,
354  * NULL_PROC and COMPOUND procedure calls for NFSv4;
355  *
356  * Passed into this function are:-
357  *
358  *	disp	A pointer to our dispatch table entry
359  *	req	The request to process
360  *	xprt	The server transport handle
361  *	ap	A pointer to the arguments
362  *
363  *
364  * When appropriate this function is responsible for inserting
365  * the reply into the duplicate cache or replaying an existing
366  * cached reply.
367  *
368  * dr_stat	reflects the state of the duplicate request that
369  *		has been inserted into or retrieved from the cache
370  *
371  * drp		is the duplicate request entry
372  *
373  */
374 int
375 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req, SVCXPRT *xprt,
376     char *ap)
377 {
378 
379 	COMPOUND4res	 res_buf;
380 	COMPOUND4res	*rbp;
381 	COMPOUND4args	*cap;
382 	cred_t		*cr = NULL;
383 	int		 error = 0;
384 	int		 dis_flags = 0;
385 	int		 dr_stat = NFS4_NOT_DUP;
386 	rfs4_dupreq_t	*drp = NULL;
387 	int		 rv;
388 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
389 	rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;
390 
391 	ASSERT(disp);
392 
393 	/*
394 	 * Short circuit the RPC_NULL proc.
395 	 */
396 	if (disp->dis_proc == rpc_null) {
397 		DTRACE_NFSV4_1(null__start, struct svc_req *, req);
398 		if (!svc_sendreply(xprt, xdr_void, NULL)) {
399 			DTRACE_NFSV4_1(null__done, struct svc_req *, req);
400 			svcerr_systemerr(xprt);
401 			return (1);
402 		}
403 		DTRACE_NFSV4_1(null__done, struct svc_req *, req);
404 		return (0);
405 	}
406 
407 	/* Only NFSv4 Compounds from this point onward */
408 
409 	rbp = &res_buf;
410 	cap = (COMPOUND4args *)ap;
411 
412 	/*
413 	 * Figure out the disposition of the whole COMPOUND
414 	 * and record it's IDEMPOTENTCY.
415 	 */
416 	rfs4_compound_flagproc(cap, &dis_flags);
417 
418 	/*
419 	 * If NON-IDEMPOTENT then we need to figure out if this
420 	 * request can be replied from the duplicate cache.
421 	 *
422 	 * If this is a new request then we need to insert the
423 	 * reply into the duplicate cache.
424 	 */
425 	if (!(dis_flags & RPC_IDEMPOTENT)) {
426 		/* look for a replay from the cache or allocate */
427 		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
428 
429 		switch (dr_stat) {
430 
431 		case NFS4_DUP_ERROR:
432 			rfs4_resource_err(req, cap);
433 			return (1);
434 			/* NOTREACHED */
435 
436 		case NFS4_DUP_PENDING:
437 			/*
438 			 * reply has previously been inserted into the
439 			 * duplicate cache, however the reply has
440 			 * not yet been sent via svc_sendreply()
441 			 */
442 			return (1);
443 			/* NOTREACHED */
444 
445 		case NFS4_DUP_NEW:
446 			curthread->t_flag |= T_DONTPEND;
447 			/* NON-IDEMPOTENT proc call */
448 			rfs4_compound(cap, rbp, NULL, req, cr, &rv);
449 			curthread->t_flag &= ~T_DONTPEND;
450 
451 			if (rv)		/* short ckt sendreply on error */
452 				return (rv);
453 
454 			/*
455 			 * dr_res must be initialized before calling
456 			 * rfs4_dr_chstate (it frees the reply).
457 			 */
458 			drp->dr_res = res_buf;
459 			if (curthread->t_flag & T_WOULDBLOCK) {
460 				curthread->t_flag &= ~T_WOULDBLOCK;
461 				/*
462 				 * mark this entry as FREE and plop
463 				 * on the end of the cache list
464 				 */
465 				mutex_enter(&drp->drc->lock);
466 				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
467 				list_insert_tail(&(drp->drc->dr_cache), drp);
468 				mutex_exit(&drp->drc->lock);
469 				return (1);
470 			}
471 			break;
472 
473 		case NFS4_DUP_REPLAY:
474 			/* replay from the cache */
475 			rbp = &(drp->dr_res);
476 			break;
477 		}
478 	} else {
479 		curthread->t_flag |= T_DONTPEND;
480 		/* IDEMPOTENT proc call */
481 		rfs4_compound(cap, rbp, NULL, req, cr, &rv);
482 		curthread->t_flag &= ~T_DONTPEND;
483 
484 		if (rv)		/* short ckt sendreply on error */
485 			return (rv);
486 
487 		if (curthread->t_flag & T_WOULDBLOCK) {
488 			curthread->t_flag &= ~T_WOULDBLOCK;
489 			return (1);
490 		}
491 	}
492 
493 	/*
494 	 * Send out the replayed reply or the 'real' one.
495 	 */
496 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
497 		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
498 		    struct svc_req *, xprt,
499 		    char *, rbp);
500 		svcerr_systemerr(xprt);
501 		error++;
502 	}
503 
504 	/*
505 	 * If this reply was just inserted into the duplicate cache
506 	 * or it was replayed from the dup cache; (re)mark it as
507 	 * available for replay
508 	 *
509 	 * At first glance, this 'if' statement seems a little strange;
510 	 * testing for NFS4_DUP_REPLAY, and then calling...
511 	 *
512 	 *	rfs4_dr_chatate(NFS4_DUP_REPLAY)
513 	 *
514 	 * ... but notice that we are checking dr_stat, and not the
515 	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
516 	 * we do that so that we know not to prematurely reap it whilst
517 	 * we resent it to the client.
518 	 *
519 	 */
520 	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
521 		mutex_enter(&drp->drc->lock);
522 		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
523 		mutex_exit(&drp->drc->lock);
524 	} else if (dr_stat == NFS4_NOT_DUP) {
525 		rfs4_compound_free(rbp);
526 	}
527 
528 	return (error);
529 }
530 
531 bool_t
532 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
533 {
534 	COMPOUND4args *argsp;
535 	COMPOUND4res res_buf, *resp;
536 
537 	if (req->rq_vers != 4)
538 		return (FALSE);
539 
540 	argsp = (COMPOUND4args *)args;
541 
542 	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
543 		return (FALSE);
544 
545 	resp = &res_buf;
546 
547 	/*
548 	 * Form a reply tag by copying over the request tag.
549 	 */
550 	resp->tag.utf8string_len = argsp->tag.utf8string_len;
551 	if (argsp->tag.utf8string_len != 0) {
552 		resp->tag.utf8string_val =
553 		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
554 		bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
555 		    resp->tag.utf8string_len);
556 	} else {
557 		resp->tag.utf8string_val = NULL;
558 	}
559 	resp->array_len = 0;
560 	resp->array = NULL;
561 	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
562 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
563 		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
564 		    SVCXPRT *, xprt, char *, resp);
565 		svcerr_systemerr(xprt);
566 	}
567 	rfs4_compound_free(resp);
568 	return (TRUE);
569 }
570 
571 void
572 rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
573 {
574 	COMPOUND4res res_buf, *rbp;
575 	nfs_resop4 *resop;
576 	PUTFH4res *resp;
577 
578 	rbp = &res_buf;
579 
580 	/*
581 	 * Form a reply tag by copying over the request tag.
582 	 */
583 	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
584 	if (argsp->tag.utf8string_len != 0) {
585 		rbp->tag.utf8string_val =
586 		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
587 		bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
588 		    rbp->tag.utf8string_len);
589 	} else {
590 		rbp->tag.utf8string_val = NULL;
591 	}
592 
593 	rbp->array_len = 1;
594 	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
595 	    KM_SLEEP);
596 	resop = &rbp->array[0];
597 	resop->resop = argsp->array[0].argop;	/* copy first op over */
598 
599 	/* Any op will do, just need to access status field */
600 	resp = &resop->nfs_resop4_u.opputfh;
601 
602 	/*
603 	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
604 	 * Note that all op numbers in the compound array were already
605 	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
606 	 */
607 	resp->status = (resop->resop == OP_ILLEGAL ?
608 	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);
609 
610 	/* compound status is same as first op status */
611 	rbp->status = resp->status;
612 
613 	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
614 		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
615 		    struct svc_req *, req->rq_xprt, char *, rbp);
616 		svcerr_systemerr(req->rq_xprt);
617 	}
618 
619 	UTF8STRING_FREE(rbp->tag);
620 	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
621 }
622