xref: /illumos-gate/usr/src/uts/common/fs/tmpfs/tmp_dir.c (revision 02b17e23cf5bf66a5ea787e066ae3d1aa49bd856)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/time.h>
33 #include <sys/vfs.h>
34 #include <sys/vnode.h>
35 #include <sys/errno.h>
36 #include <sys/cmn_err.h>
37 #include <sys/cred.h>
38 #include <sys/stat.h>
39 #include <sys/debug.h>
40 #include <sys/policy.h>
41 #include <sys/fs/tmpnode.h>
42 #include <sys/fs/tmp.h>
43 #include <sys/vtrace.h>
44 
45 static int tdircheckpath(struct tmpnode *, struct tmpnode *, struct cred *);
46 static int tdirrename(struct tmpnode *, struct tmpnode *, struct tmpnode *,
47 	char *, struct tmpnode *, struct tdirent *, struct cred *);
48 static void tdirfixdotdot(struct tmpnode *, struct tmpnode *, struct tmpnode *);
49 static int tdirmaketnode(struct tmpnode *, struct tmount *, struct vattr *,
50 	enum de_op, struct tmpnode **, struct cred *);
51 static int tdiraddentry(struct tmpnode *, struct tmpnode *, char *,
52 	enum de_op, struct tmpnode *);
53 
54 
55 #define	T_HASH_SIZE	8192		/* must be power of 2 */
56 #define	T_MUTEX_SIZE	64
57 
58 /* Non-static so compilers won't constant-fold these away. */
59 clock_t tmpfs_rename_backoff_delay = 1;
60 unsigned int tmpfs_rename_backoff_tries = 0;
61 unsigned long tmpfs_rename_loops = 0;
62 
63 static struct tdirent	*t_hashtable[T_HASH_SIZE];
64 static kmutex_t		 t_hashmutex[T_MUTEX_SIZE];
65 
66 #define	T_HASH_INDEX(a)		((a) & (T_HASH_SIZE-1))
67 #define	T_MUTEX_INDEX(a)	((a) & (T_MUTEX_SIZE-1))
68 
69 #define	TMPFS_HASH(tp, name, hash)				\
70 	{							\
71 		char Xc, *Xcp;					\
72 		hash = (uint_t)(uintptr_t)(tp) >> 8;		\
73 		for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)	\
74 			hash = (hash << 4) + hash + (uint_t)Xc;	\
75 	}
76 
77 void
78 tmpfs_hash_init(void)
79 {
80 	int	ix;
81 
82 	for (ix = 0; ix < T_MUTEX_SIZE; ix++)
83 		mutex_init(&t_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
84 }
85 
86 /*
87  * This routine is where the rubber meets the road for identities.
88  */
89 static void
90 tmpfs_hash_in(struct tdirent *t)
91 {
92 	uint_t		hash;
93 	struct tdirent	**prevpp;
94 	kmutex_t	*t_hmtx;
95 
96 	TMPFS_HASH(t->td_parent, t->td_name, hash);
97 	t->td_hash = hash;
98 	prevpp = &t_hashtable[T_HASH_INDEX(hash)];
99 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
100 	mutex_enter(t_hmtx);
101 	t->td_link = *prevpp;
102 	*prevpp = t;
103 	mutex_exit(t_hmtx);
104 }
105 
106 /*
107  * Remove tdirent *t from the hash list.
108  */
109 static void
110 tmpfs_hash_out(struct tdirent *t)
111 {
112 	uint_t		hash;
113 	struct tdirent	**prevpp;
114 	kmutex_t	*t_hmtx;
115 
116 	hash = t->td_hash;
117 	prevpp = &t_hashtable[T_HASH_INDEX(hash)];
118 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
119 	mutex_enter(t_hmtx);
120 	while (*prevpp != t)
121 		prevpp = &(*prevpp)->td_link;
122 	*prevpp = t->td_link;
123 	mutex_exit(t_hmtx);
124 }
125 
126 /*
127  * Currently called by tdirrename() only.
128  * rename operation needs to be done with lock held, to ensure that
129  * no other operations can access the tmpnode at the same instance.
130  */
131 static void
132 tmpfs_hash_change(struct tdirent *tdp, struct tmpnode *fromtp)
133 {
134 	uint_t		hash;
135 	kmutex_t	*t_hmtx;
136 
137 	hash = tdp->td_hash;
138 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
139 	mutex_enter(t_hmtx);
140 	tdp->td_tmpnode = fromtp;
141 	mutex_exit(t_hmtx);
142 }
143 
144 static struct tdirent *
145 tmpfs_hash_lookup(char *name, struct tmpnode *parent, uint_t hold,
146 	struct tmpnode **found)
147 {
148 	struct tdirent	*l;
149 	uint_t		hash;
150 	kmutex_t	*t_hmtx;
151 	struct tmpnode	*tnp;
152 
153 	TMPFS_HASH(parent, name, hash);
154 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
155 	mutex_enter(t_hmtx);
156 	l = t_hashtable[T_HASH_INDEX(hash)];
157 	while (l) {
158 		if ((l->td_hash == hash) &&
159 		    (l->td_parent == parent) &&
160 		    (strcmp(l->td_name, name) == 0)) {
161 			/*
162 			 * We need to make sure that the tmpnode that
163 			 * we put a hold on is the same one that we pass back.
164 			 * Hence, temporary variable tnp is necessary.
165 			 */
166 			tnp = l->td_tmpnode;
167 			if (hold) {
168 				ASSERT(tnp);
169 				tmpnode_hold(tnp);
170 			}
171 			if (found)
172 				*found = tnp;
173 			mutex_exit(t_hmtx);
174 			return (l);
175 		} else {
176 			l = l->td_link;
177 		}
178 	}
179 	mutex_exit(t_hmtx);
180 	return (NULL);
181 }
182 
183 /*
184  * Search directory 'parent' for entry 'name'.
185  *
186  * The calling thread can't hold the write version
187  * of the rwlock for the directory being searched
188  *
189  * 0 is returned on success and *foundtp points
190  * to the found tmpnode with its vnode held.
191  */
192 int
193 tdirlookup(
194 	struct tmpnode *parent,
195 	char *name,
196 	struct tmpnode **foundtp,
197 	struct cred *cred)
198 {
199 	int error;
200 
201 	*foundtp = NULL;
202 	if (parent->tn_type != VDIR)
203 		return (ENOTDIR);
204 
205 	if ((error = tmp_taccess(parent, VEXEC, cred)))
206 		return (error);
207 
208 	if (*name == '\0') {
209 		tmpnode_hold(parent);
210 		*foundtp = parent;
211 		return (0);
212 	}
213 
214 	/*
215 	 * Search the directory for the matching name
216 	 * We need the lock protecting the tn_dir list
217 	 * so that it doesn't change out from underneath us.
218 	 * tmpfs_hash_lookup() will pass back the tmpnode
219 	 * with a hold on it.
220 	 */
221 
222 	if (tmpfs_hash_lookup(name, parent, 1, foundtp) != NULL) {
223 		ASSERT(*foundtp);
224 		return (0);
225 	}
226 
227 	return (ENOENT);
228 }
229 
230 /*
231  * Enter a directory entry for 'name' and 'tp' into directory 'dir'
232  *
233  * Returns 0 on success.
234  */
235 int
236 tdirenter(
237 	struct tmount	*tm,
238 	struct tmpnode	*dir,		/* target directory to make entry in */
239 	char		*name,		/* name of entry */
240 	enum de_op	op,		/* entry operation */
241 	struct tmpnode	*fromparent,	/* source directory if rename */
242 	struct tmpnode	*tp,		/* source tmpnode, if link/rename */
243 	struct vattr	*va,
244 	struct tmpnode	**tpp,		/* return tmpnode, if create/mkdir */
245 	struct cred	*cred,
246 	caller_context_t *ctp)
247 {
248 	struct tdirent *tdp;
249 	struct tmpnode *found = NULL;
250 	int error = 0;
251 	char *s;
252 
253 	/*
254 	 * tn_rwlock is held to serialize direnter and dirdeletes
255 	 */
256 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
257 	ASSERT(dir->tn_type == VDIR);
258 
259 	/*
260 	 * Don't allow '/' characters in pathname component
261 	 * (thus in ufs_direnter()).
262 	 */
263 	for (s = name; *s; s++)
264 		if (*s == '/')
265 			return (EACCES);
266 
267 	if (name[0] == '\0')
268 		panic("tdirenter: NULL name");
269 
270 	/*
271 	 * For link and rename lock the source entry and check the link count
272 	 * to see if it has been removed while it was unlocked.
273 	 */
274 	if (op == DE_LINK || op == DE_RENAME) {
275 		if (tp != dir) {
276 			unsigned int tries = 0;
277 
278 			/*
279 			 * If we are acquiring tp->tn_rwlock (for SOURCE)
280 			 * inside here, we must consider the following:
281 			 *
282 			 * - dir->tn_rwlock (TARGET) is already HELD (see
283 			 * above ASSERT()).
284 			 *
285 			 * - It is possible our SOURCE is a parent of our
286 			 * TARGET. Yes it's unusual, but it will return an
287 			 * error below via tdircheckpath().
288 			 *
289 			 * - It is also possible that another thread,
290 			 * concurrent to this one, is performing
291 			 * rmdir(TARGET), which means it will first acquire
292 			 * SOURCE's lock, THEN acquire TARGET's lock, which
293 			 * could result in this thread holding TARGET and
294 			 * trying for SOURCE, but the other thread holding
295 			 * SOURCE and trying for TARGET.  This is deadlock,
296 			 * and it's inducible.
297 			 *
298 			 * To prevent this, we borrow some techniques from UFS
299 			 * and rw_tryenter(), delaying if we fail, and
300 			 * if someone tweaks the number of backoff tries to be
301 			 * nonzero, return EBUSY after that number of tries.
302 			 */
303 			while (!rw_tryenter(&tp->tn_rwlock, RW_WRITER)) {
304 				/*
305 				 * Sloppy, but this is a diagnostic so atomic
306 				 * increment would be overkill.
307 				 */
308 				tmpfs_rename_loops++;
309 
310 				if (tmpfs_rename_backoff_tries != 0) {
311 					if (tries > tmpfs_rename_backoff_tries)
312 						return (EBUSY);
313 					tries++;
314 				}
315 				/*
316 				 * NOTE: We're still holding dir->tn_rwlock,
317 				 * so drop it over the delay, so any other
318 				 * thread can get its business done.
319 				 *
320 				 * No state change or state inspection happens
321 				 * prior to here, so it is not wholly dangerous
322 				 * to release-and-reacquire dir->tn_rwlock.
323 				 *
324 				 * Hold the vnode of dir in case it gets
325 				 * released by another thread, though.
326 				 */
327 				VN_HOLD(TNTOV(dir));
328 				rw_exit(&dir->tn_rwlock);
329 				delay(tmpfs_rename_backoff_delay);
330 				rw_enter(&dir->tn_rwlock, RW_WRITER);
331 				VN_RELE(TNTOV(dir));
332 			}
333 		}
334 		mutex_enter(&tp->tn_tlock);
335 		if (tp->tn_nlink == 0) {
336 			mutex_exit(&tp->tn_tlock);
337 			if (tp != dir)
338 				rw_exit(&tp->tn_rwlock);
339 			return (ENOENT);
340 		}
341 
342 		if (tp->tn_nlink == MAXLINK) {
343 			mutex_exit(&tp->tn_tlock);
344 			if (tp != dir)
345 				rw_exit(&tp->tn_rwlock);
346 			return (EMLINK);
347 		}
348 		tp->tn_nlink++;
349 		gethrestime(&tp->tn_ctime);
350 		mutex_exit(&tp->tn_tlock);
351 		if (tp != dir)
352 			rw_exit(&tp->tn_rwlock);
353 	}
354 
355 	/*
356 	 * This might be a "dangling detached directory".
357 	 * it could have been removed, but a reference
358 	 * to it kept in u_cwd.  don't bother searching
359 	 * it, and with any luck the user will get tired
360 	 * of dealing with us and cd to some absolute
361 	 * pathway.  *sigh*, thus in ufs, too.
362 	 */
363 	if (dir->tn_nlink == 0) {
364 		error = ENOENT;
365 		goto out;
366 	}
367 
368 	/*
369 	 * If this is a rename of a directory and the parent is
370 	 * different (".." must be changed), then the source
371 	 * directory must not be in the directory hierarchy
372 	 * above the target, as this would orphan everything
373 	 * below the source directory.
374 	 */
375 	if (op == DE_RENAME) {
376 		if (tp == dir) {
377 			error = EINVAL;
378 			goto out;
379 		}
380 		if (tp->tn_type == VDIR) {
381 			if ((fromparent != dir) &&
382 			    (error = tdircheckpath(tp, dir, cred))) {
383 				goto out;
384 			}
385 		}
386 	}
387 
388 	/*
389 	 * Search for the entry.  Return "found" if it exists.
390 	 */
391 	tdp = tmpfs_hash_lookup(name, dir, 1, &found);
392 
393 	if (tdp) {
394 		ASSERT(found);
395 		switch (op) {
396 		case DE_CREATE:
397 		case DE_MKDIR:
398 			if (tpp) {
399 				*tpp = found;
400 				error = EEXIST;
401 			} else {
402 				tmpnode_rele(found);
403 			}
404 			break;
405 
406 		case DE_RENAME:
407 			error = tdirrename(fromparent, tp,
408 			    dir, name, found, tdp, cred);
409 			if (error == 0) {
410 				if (found != NULL) {
411 					vnevent_rename_dest(TNTOV(found),
412 					    TNTOV(dir), name, ctp);
413 				}
414 			}
415 
416 			tmpnode_rele(found);
417 			break;
418 
419 		case DE_LINK:
420 			/*
421 			 * Can't link to an existing file.
422 			 */
423 			error = EEXIST;
424 			tmpnode_rele(found);
425 			break;
426 		}
427 	} else {
428 
429 		/*
430 		 * The entry does not exist. Check write permission in
431 		 * directory to see if entry can be created.
432 		 */
433 		if (error = tmp_taccess(dir, VWRITE, cred))
434 			goto out;
435 		if (op == DE_CREATE || op == DE_MKDIR) {
436 			/*
437 			 * Make new tmpnode and directory entry as required.
438 			 */
439 			error = tdirmaketnode(dir, tm, va, op, &tp, cred);
440 			if (error)
441 				goto out;
442 		}
443 		if (error = tdiraddentry(dir, tp, name, op, fromparent)) {
444 			if (op == DE_CREATE || op == DE_MKDIR) {
445 				/*
446 				 * Unmake the inode we just made.
447 				 */
448 				rw_enter(&tp->tn_rwlock, RW_WRITER);
449 				if ((tp->tn_type) == VDIR) {
450 					ASSERT(tdp == NULL);
451 					/*
452 					 * cleanup allocs made by tdirinit()
453 					 */
454 					tdirtrunc(tp);
455 				}
456 				mutex_enter(&tp->tn_tlock);
457 				tp->tn_nlink = 0;
458 				mutex_exit(&tp->tn_tlock);
459 				gethrestime(&tp->tn_ctime);
460 				rw_exit(&tp->tn_rwlock);
461 				tmpnode_rele(tp);
462 				tp = NULL;
463 			}
464 		} else if (tpp) {
465 			*tpp = tp;
466 		} else if (op == DE_CREATE || op == DE_MKDIR) {
467 			tmpnode_rele(tp);
468 		}
469 	}
470 
471 out:
472 	if (error && (op == DE_LINK || op == DE_RENAME)) {
473 		/*
474 		 * Undo bumped link count.
475 		 */
476 		DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
477 		gethrestime(&tp->tn_ctime);
478 	}
479 	return (error);
480 }
481 
482 /*
483  * Delete entry tp of name "nm" from dir.
484  * Free dir entry space and decrement link count on tmpnode(s).
485  *
486  * Return 0 on success.
487  */
488 int
489 tdirdelete(
490 	struct tmpnode *dir,
491 	struct tmpnode *tp,
492 	char *nm,
493 	enum dr_op op,
494 	struct cred *cred)
495 {
496 	struct tdirent *tpdp;
497 	int error;
498 	size_t namelen;
499 	struct tmpnode *tnp;
500 	timestruc_t now;
501 
502 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
503 	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
504 	ASSERT(dir->tn_type == VDIR);
505 
506 	if (nm[0] == '\0')
507 		panic("tdirdelete: NULL name for %p", (void *)tp);
508 
509 	/*
510 	 * return error when removing . and ..
511 	 */
512 	if (nm[0] == '.') {
513 		if (nm[1] == '\0')
514 			return (EINVAL);
515 		if (nm[1] == '.' && nm[2] == '\0')
516 			return (EEXIST); /* thus in ufs */
517 	}
518 
519 	if (error = tmp_taccess(dir, VEXEC|VWRITE, cred))
520 		return (error);
521 
522 	/*
523 	 * If the parent directory is "sticky", then the user must
524 	 * own the parent directory or the file in it, or else must
525 	 * have permission to write the file.  Otherwise it may not
526 	 * be deleted (except by privileged users).
527 	 * Same as ufs_dirremove.
528 	 */
529 	if ((error = tmp_sticky_remove_access(dir, tp, cred)) != 0)
530 		return (error);
531 
532 	if (dir->tn_dir == NULL)
533 		return (ENOENT);
534 
535 	tpdp = tmpfs_hash_lookup(nm, dir, 0, &tnp);
536 	if (tpdp == NULL) {
537 		/*
538 		 * If it is gone, some other thread got here first!
539 		 * Return error ENOENT.
540 		 */
541 		return (ENOENT);
542 	}
543 
544 	/*
545 	 * If the tmpnode in the tdirent changed, we were probably
546 	 * the victim of a concurrent rename operation.  The original
547 	 * is gone, so return that status (same as UFS).
548 	 */
549 	if (tp != tnp)
550 		return (ENOENT);
551 
552 	tmpfs_hash_out(tpdp);
553 
554 	/*
555 	 * Take tpdp out of the directory list.
556 	 */
557 	ASSERT(tpdp->td_next != tpdp);
558 	ASSERT(tpdp->td_prev != tpdp);
559 	if (tpdp->td_prev) {
560 		tpdp->td_prev->td_next = tpdp->td_next;
561 	}
562 	if (tpdp->td_next) {
563 		tpdp->td_next->td_prev = tpdp->td_prev;
564 	}
565 
566 	/*
567 	 * If the roving slot pointer happens to match tpdp,
568 	 * point it at the previous dirent.
569 	 */
570 	if (dir->tn_dir->td_prev == tpdp) {
571 		dir->tn_dir->td_prev = tpdp->td_prev;
572 	}
573 	ASSERT(tpdp->td_next != tpdp);
574 	ASSERT(tpdp->td_prev != tpdp);
575 
576 	/*
577 	 * tpdp points to the correct directory entry
578 	 */
579 	namelen = strlen(tpdp->td_name) + 1;
580 
581 	tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
582 	dir->tn_size -= (sizeof (struct tdirent) + namelen);
583 	dir->tn_dirents--;
584 
585 	gethrestime(&now);
586 	dir->tn_mtime = now;
587 	dir->tn_ctime = now;
588 	tp->tn_ctime = now;
589 
590 	ASSERT(tp->tn_nlink > 0);
591 	DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
592 	if (op == DR_RMDIR && tp->tn_type == VDIR) {
593 		tdirtrunc(tp);
594 		ASSERT(tp->tn_nlink == 0);
595 	}
596 	return (0);
597 }
598 
599 /*
600  * tdirinit is used internally to initialize a directory (dir)
601  * with '.' and '..' entries without checking permissions and locking
602  */
603 void
604 tdirinit(
605 	struct tmpnode *parent,		/* parent of directory to initialize */
606 	struct tmpnode *dir)		/* the new directory */
607 {
608 	struct tdirent *dot, *dotdot;
609 	timestruc_t now;
610 
611 	ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
612 	ASSERT(dir->tn_type == VDIR);
613 
614 	dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
615 	dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);
616 
617 	/*
618 	 * Initialize the entries
619 	 */
620 	dot->td_tmpnode = dir;
621 	dot->td_offset = 0;
622 	dot->td_name = (char *)dot + sizeof (struct tdirent);
623 	dot->td_name[0] = '.';
624 	dot->td_parent = dir;
625 	tmpfs_hash_in(dot);
626 
627 	dotdot->td_tmpnode = parent;
628 	dotdot->td_offset = 1;
629 	dotdot->td_name = (char *)dotdot + sizeof (struct tdirent);
630 	dotdot->td_name[0] = '.';
631 	dotdot->td_name[1] = '.';
632 	dotdot->td_parent = dir;
633 	tmpfs_hash_in(dotdot);
634 
635 	/*
636 	 * Initialize directory entry list.
637 	 */
638 	dot->td_next = dotdot;
639 	dot->td_prev = dotdot;	/* dot's td_prev holds roving slot pointer */
640 	dotdot->td_next = NULL;
641 	dotdot->td_prev = dot;
642 
643 	gethrestime(&now);
644 	dir->tn_mtime = now;
645 	dir->tn_ctime = now;
646 
647 	/*
648 	 * Link counts are special for the hidden attribute directory.
649 	 * The only explicit reference in the name space is "." and
650 	 * the reference through ".." is not counted on the parent
651 	 * file. The attrdir is created as a side effect to lookup,
652 	 * so don't change the ctime of the parent.
653 	 * Since tdirinit is called with both dir and parent being the
654 	 * same for the root vnode, we need to increment this before we set
655 	 * tn_nlink = 2 below.
656 	 */
657 	if (!(dir->tn_vnode->v_flag & V_XATTRDIR)) {
658 		INCR_COUNT(&parent->tn_nlink, &parent->tn_tlock);
659 		parent->tn_ctime = now;
660 	}
661 
662 	dir->tn_dir = dot;
663 	dir->tn_size = 2 * sizeof (struct tdirent) + 5;	/* dot and dotdot */
664 	dir->tn_dirents = 2;
665 	dir->tn_nlink = 2;
666 }
667 
668 
669 /*
670  * tdirtrunc is called to remove all directory entries under this directory.
671  */
672 void
673 tdirtrunc(struct tmpnode *dir)
674 {
675 	struct tdirent *tdp;
676 	struct tmpnode *tp;
677 	size_t namelen;
678 	timestruc_t now;
679 	int isvattrdir, isdotdot, skip_decr;
680 
681 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
682 	ASSERT(dir->tn_type == VDIR);
683 
684 	isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 1 : 0;
685 	for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) {
686 		ASSERT(tdp->td_next != tdp);
687 		ASSERT(tdp->td_prev != tdp);
688 		ASSERT(tdp->td_tmpnode);
689 
690 		dir->tn_dir = tdp->td_next;
691 		namelen = strlen(tdp->td_name) + 1;
692 
693 		/*
694 		 * Adjust the link counts to account for this directory
695 		 * entry removal. Hidden attribute directories may
696 		 * not be empty as they may be truncated as a side-
697 		 * effect of removing the parent. We do hold/rele
698 		 * operations to free up these tmpnodes.
699 		 *
700 		 * Skip the link count adjustment for parents of
701 		 * attribute directories as those link counts
702 		 * do not include the ".." reference in the hidden
703 		 * directories.
704 		 */
705 		tp = tdp->td_tmpnode;
706 		isdotdot = (strcmp("..", tdp->td_name) == 0);
707 		skip_decr = (isvattrdir && isdotdot);
708 		if (!skip_decr) {
709 			ASSERT(tp->tn_nlink > 0);
710 			DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
711 		}
712 
713 		tmpfs_hash_out(tdp);
714 
715 		tmp_memfree(tdp, sizeof (struct tdirent) + namelen);
716 		dir->tn_size -= (sizeof (struct tdirent) + namelen);
717 		dir->tn_dirents--;
718 	}
719 
720 	gethrestime(&now);
721 	dir->tn_mtime = now;
722 	dir->tn_ctime = now;
723 
724 	ASSERT(dir->tn_dir == NULL);
725 	ASSERT(dir->tn_size == 0);
726 	ASSERT(dir->tn_dirents == 0);
727 }
728 
729 /*
730  * Check if the source directory is in the path of the target directory.
731  * The target directory is locked by the caller.
732  *
733  * XXX - The source and target's should be different upon entry.
734  */
735 static int
736 tdircheckpath(
737 	struct tmpnode *fromtp,
738 	struct tmpnode	*toparent,
739 	struct cred	*cred)
740 {
741 	int	error = 0;
742 	struct tmpnode *dir, *dotdot;
743 	struct tdirent *tdp;
744 
745 	ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));
746 
747 	tdp = tmpfs_hash_lookup("..", toparent, 1, &dotdot);
748 	if (tdp == NULL)
749 		return (ENOENT);
750 
751 	ASSERT(dotdot);
752 
753 	if (dotdot == toparent) {
754 		/* root of fs.  search trivially satisfied. */
755 		tmpnode_rele(dotdot);
756 		return (0);
757 	}
758 	for (;;) {
759 		/*
760 		 * Return error for cases like "mv c c/d",
761 		 * "mv c c/d/e" and so on.
762 		 */
763 		if (dotdot == fromtp) {
764 			tmpnode_rele(dotdot);
765 			error = EINVAL;
766 			break;
767 		}
768 		dir = dotdot;
769 		error = tdirlookup(dir, "..", &dotdot, cred);
770 		if (error) {
771 			tmpnode_rele(dir);
772 			break;
773 		}
774 		/*
775 		 * We're okay if we traverse the directory tree up to
776 		 * the root directory and don't run into the
777 		 * parent directory.
778 		 */
779 		if (dir == dotdot) {
780 			tmpnode_rele(dir);
781 			tmpnode_rele(dotdot);
782 			break;
783 		}
784 		tmpnode_rele(dir);
785 	}
786 	return (error);
787 }
788 
789 static int
790 tdirrename(
791 	struct tmpnode *fromparent,	/* parent directory of source */
792 	struct tmpnode *fromtp,		/* source tmpnode */
793 	struct tmpnode *toparent,	/* parent directory of target */
794 	char *nm,			/* entry we are trying to change */
795 	struct tmpnode *to,		/* target tmpnode */
796 	struct tdirent *where,		/* target tmpnode directory entry */
797 	struct cred *cred)		/* credentials */
798 {
799 	int error = 0;
800 	int doingdirectory;
801 	timestruc_t now;
802 
803 #if defined(lint)
804 	nm = nm;
805 #endif
806 	ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));
807 
808 	/*
809 	 * Short circuit rename of something to itself.
810 	 */
811 	if (fromtp == to)
812 		return (ESAME);		/* special KLUDGE error code */
813 
814 	rw_enter(&fromtp->tn_rwlock, RW_READER);
815 	rw_enter(&to->tn_rwlock, RW_READER);
816 
817 	/*
818 	 * Check that everything is on the same filesystem.
819 	 */
820 	if (to->tn_vnode->v_vfsp != toparent->tn_vnode->v_vfsp ||
821 	    to->tn_vnode->v_vfsp != fromtp->tn_vnode->v_vfsp) {
822 		error = EXDEV;
823 		goto out;
824 	}
825 
826 	/*
827 	 * Must have write permission to rewrite target entry.
828 	 * Check for stickyness.
829 	 */
830 	if ((error = tmp_taccess(toparent, VWRITE, cred)) != 0 ||
831 	    (error = tmp_sticky_remove_access(toparent, to, cred)) != 0)
832 		goto out;
833 
834 	/*
835 	 * Ensure source and target are compatible (both directories
836 	 * or both not directories).  If target is a directory it must
837 	 * be empty and have no links to it; in addition it must not
838 	 * be a mount point, and both the source and target must be
839 	 * writable.
840 	 */
841 	doingdirectory = (fromtp->tn_type == VDIR);
842 	if (to->tn_type == VDIR) {
843 		if (!doingdirectory) {
844 			error = EISDIR;
845 			goto out;
846 		}
847 		/*
848 		 * vn_vfswlock will prevent mounts from using the directory
849 		 * until we are done.
850 		 */
851 		if (vn_vfswlock(TNTOV(to))) {
852 			error = EBUSY;
853 			goto out;
854 		}
855 		if (vn_mountedvfs(TNTOV(to)) != NULL) {
856 			vn_vfsunlock(TNTOV(to));
857 			error = EBUSY;
858 			goto out;
859 		}
860 
861 		mutex_enter(&to->tn_tlock);
862 		if (to->tn_dirents > 2 || to->tn_nlink > 2) {
863 			mutex_exit(&to->tn_tlock);
864 			vn_vfsunlock(TNTOV(to));
865 			error = EEXIST; /* SIGH should be ENOTEMPTY */
866 			/*
867 			 * Update atime because checking tn_dirents is
868 			 * logically equivalent to reading the directory
869 			 */
870 			gethrestime(&to->tn_atime);
871 			goto out;
872 		}
873 		mutex_exit(&to->tn_tlock);
874 	} else if (doingdirectory) {
875 		error = ENOTDIR;
876 		goto out;
877 	}
878 
879 	tmpfs_hash_change(where, fromtp);
880 	gethrestime(&now);
881 	toparent->tn_mtime = now;
882 	toparent->tn_ctime = now;
883 
884 	/*
885 	 * Upgrade to write lock on "to" (i.e., the target tmpnode).
886 	 */
887 	rw_exit(&to->tn_rwlock);
888 	rw_enter(&to->tn_rwlock, RW_WRITER);
889 
890 	/*
891 	 * Decrement the link count of the target tmpnode.
892 	 */
893 	DECR_COUNT(&to->tn_nlink, &to->tn_tlock);
894 	to->tn_ctime = now;
895 
896 	if (doingdirectory) {
897 		/*
898 		 * The entry for "to" no longer exists so release the vfslock.
899 		 */
900 		vn_vfsunlock(TNTOV(to));
901 
902 		/*
903 		 * Decrement the target link count and delete all entires.
904 		 */
905 		tdirtrunc(to);
906 		ASSERT(to->tn_nlink == 0);
907 
908 		/*
909 		 * Renaming a directory with the parent different
910 		 * requires that ".." be rewritten.  The window is
911 		 * still there for ".." to be inconsistent, but this
912 		 * is unavoidable, and a lot shorter than when it was
913 		 * done in a user process.
914 		 */
915 		if (fromparent != toparent)
916 			tdirfixdotdot(fromtp, fromparent, toparent);
917 	}
918 out:
919 	rw_exit(&to->tn_rwlock);
920 	rw_exit(&fromtp->tn_rwlock);
921 	return (error);
922 }
923 
924 static void
925 tdirfixdotdot(
926 	struct tmpnode	*fromtp,	/* child directory */
927 	struct tmpnode	*fromparent,	/* old parent directory */
928 	struct tmpnode	*toparent)	/* new parent directory */
929 {
930 	struct tdirent	*dotdot;
931 
932 	ASSERT(RW_LOCK_HELD(&toparent->tn_rwlock));
933 
934 	/*
935 	 * Increment the link count in the new parent tmpnode
936 	 */
937 	INCR_COUNT(&toparent->tn_nlink, &toparent->tn_tlock);
938 	gethrestime(&toparent->tn_ctime);
939 
940 	dotdot = tmpfs_hash_lookup("..", fromtp, 0, NULL);
941 
942 	ASSERT(dotdot->td_tmpnode == fromparent);
943 	dotdot->td_tmpnode = toparent;
944 
945 	/*
946 	 * Decrement the link count of the old parent tmpnode.
947 	 * If fromparent is NULL, then this is a new directory link;
948 	 * it has no parent, so we need not do anything.
949 	 */
950 	if (fromparent != NULL) {
951 		mutex_enter(&fromparent->tn_tlock);
952 		if (fromparent->tn_nlink != 0) {
953 			fromparent->tn_nlink--;
954 			gethrestime(&fromparent->tn_ctime);
955 		}
956 		mutex_exit(&fromparent->tn_tlock);
957 	}
958 }
959 
960 static int
961 tdiraddentry(
962 	struct tmpnode	*dir,	/* target directory to make entry in */
963 	struct tmpnode	*tp,	/* new tmpnode */
964 	char		*name,
965 	enum de_op	op,
966 	struct tmpnode	*fromtp)
967 {
968 	struct tdirent *tdp, *tpdp;
969 	size_t		namelen, alloc_size;
970 	timestruc_t	now;
971 
972 	/*
973 	 * Make sure the parent directory wasn't removed from
974 	 * underneath the caller.
975 	 */
976 	if (dir->tn_dir == NULL)
977 		return (ENOENT);
978 
979 	/*
980 	 * Check that everything is on the same filesystem.
981 	 */
982 	if (tp->tn_vnode->v_vfsp != dir->tn_vnode->v_vfsp)
983 		return (EXDEV);
984 
985 	/*
986 	 * Allocate and initialize directory entry
987 	 */
988 	namelen = strlen(name) + 1;
989 	alloc_size = namelen + sizeof (struct tdirent);
990 	tdp = tmp_memalloc(alloc_size, 0);
991 	if (tdp == NULL)
992 		return (ENOSPC);
993 
994 	if ((op == DE_RENAME) && (tp->tn_type == VDIR))
995 		tdirfixdotdot(tp, fromtp, dir);
996 
997 	dir->tn_size += alloc_size;
998 	dir->tn_dirents++;
999 	tdp->td_tmpnode = tp;
1000 	tdp->td_parent = dir;
1001 
1002 	/*
1003 	 * The directory entry and its name were allocated sequentially.
1004 	 */
1005 	tdp->td_name = (char *)tdp + sizeof (struct tdirent);
1006 	(void) strcpy(tdp->td_name, name);
1007 
1008 	tmpfs_hash_in(tdp);
1009 
1010 	/*
1011 	 * Some utilities expect the size of a directory to remain
1012 	 * somewhat static.  For example, a routine which unlinks
1013 	 * files between calls to readdir(); the size of the
1014 	 * directory changes from underneath it and so the real
1015 	 * directory offset in bytes is invalid.  To circumvent
1016 	 * this problem, we initialize a directory entry with an
1017 	 * phony offset, and use this offset to determine end of
1018 	 * file in tmp_readdir.
1019 	 */
1020 	tpdp = dir->tn_dir->td_prev;
1021 	/*
1022 	 * Install at first empty "slot" in directory list.
1023 	 */
1024 	while (tpdp->td_next != NULL && (tpdp->td_next->td_offset -
1025 	    tpdp->td_offset) <= 1) {
1026 		ASSERT(tpdp->td_next != tpdp);
1027 		ASSERT(tpdp->td_prev != tpdp);
1028 		ASSERT(tpdp->td_next->td_offset > tpdp->td_offset);
1029 		tpdp = tpdp->td_next;
1030 	}
1031 	tdp->td_offset = tpdp->td_offset + 1;
1032 
1033 	/*
1034 	 * If we're at the end of the dirent list and the offset (which
1035 	 * is necessarily the largest offset in this directory) is more
1036 	 * than twice the number of dirents, that means the directory is
1037 	 * 50% holes.  At this point we reset the slot pointer back to
1038 	 * the beginning of the directory so we start using the holes.
1039 	 * The idea is that if there are N dirents, there must also be
1040 	 * N holes, so we can satisfy the next N creates by walking at
1041 	 * most 2N entries; thus the average cost of a create is constant.
1042 	 * Note that we use the first dirent's td_prev as the roving
1043 	 * slot pointer; it's ugly, but it saves a word in every dirent.
1044 	 */
1045 	if (tpdp->td_next == NULL && tpdp->td_offset > 2 * dir->tn_dirents)
1046 		dir->tn_dir->td_prev = dir->tn_dir->td_next;
1047 	else
1048 		dir->tn_dir->td_prev = tdp;
1049 
1050 	ASSERT(tpdp->td_next != tpdp);
1051 	ASSERT(tpdp->td_prev != tpdp);
1052 
1053 	tdp->td_next = tpdp->td_next;
1054 	if (tdp->td_next) {
1055 		tdp->td_next->td_prev = tdp;
1056 	}
1057 	tdp->td_prev = tpdp;
1058 	tpdp->td_next = tdp;
1059 
1060 	ASSERT(tdp->td_next != tdp);
1061 	ASSERT(tdp->td_prev != tdp);
1062 	ASSERT(tpdp->td_next != tpdp);
1063 	ASSERT(tpdp->td_prev != tpdp);
1064 
1065 	gethrestime(&now);
1066 	dir->tn_mtime = now;
1067 	dir->tn_ctime = now;
1068 
1069 	return (0);
1070 }
1071 
1072 static int
1073 tdirmaketnode(
1074 	struct tmpnode *dir,
1075 	struct tmount	*tm,
1076 	struct vattr	*va,
1077 	enum	de_op	op,
1078 	struct tmpnode **newnode,
1079 	struct cred	*cred)
1080 {
1081 	struct tmpnode *tp;
1082 	enum vtype	type;
1083 
1084 	ASSERT(va != NULL);
1085 	ASSERT(op == DE_CREATE || op == DE_MKDIR);
1086 	if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
1087 	    ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
1088 		return (EOVERFLOW);
1089 	type = va->va_type;
1090 	tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
1091 	tmpnode_init(tm, tp, va, cred);
1092 
1093 	/* setup normal file/dir's extended attribute directory */
1094 	if (dir->tn_flags & ISXATTR) {
1095 		/* parent dir is , mark file as xattr */
1096 		tp->tn_flags |= ISXATTR;
1097 	}
1098 
1099 
1100 	if (type == VBLK || type == VCHR) {
1101 		tp->tn_vnode->v_rdev = tp->tn_rdev = va->va_rdev;
1102 	} else {
1103 		tp->tn_vnode->v_rdev = tp->tn_rdev = NODEV;
1104 	}
1105 	tp->tn_vnode->v_type = type;
1106 	tp->tn_uid = crgetuid(cred);
1107 
1108 	/*
1109 	 * To determine the group-id of the created file:
1110 	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
1111 	 *	clients are not likely to set the gid), then use it if
1112 	 *	the process is privileged, belongs to the target group,
1113 	 *	or the group is the same as the parent directory.
1114 	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
1115 	 *	GRPID option, and the directory's set-gid bit is clear,
1116 	 *	then use the process's gid.
1117 	 *   3) Otherwise, set the group-id to the gid of the parent directory.
1118 	 */
1119 	if ((va->va_mask & AT_GID) &&
1120 	    ((va->va_gid == dir->tn_gid) || groupmember(va->va_gid, cred) ||
1121 	    secpolicy_vnode_create_gid(cred) == 0)) {
1122 		/*
1123 		 * XXX - is this only the case when a 4.0 NFS client, or a
1124 		 * client derived from that code, makes a call over the wire?
1125 		 */
1126 		tp->tn_gid = va->va_gid;
1127 	} else {
1128 		if (dir->tn_mode & VSGID)
1129 			tp->tn_gid = dir->tn_gid;
1130 		else
1131 			tp->tn_gid = crgetgid(cred);
1132 	}
1133 	/*
1134 	 * If we're creating a directory, and the parent directory has the
1135 	 * set-GID bit set, set it on the new directory.
1136 	 * Otherwise, if the user is neither privileged nor a member of the
1137 	 * file's new group, clear the file's set-GID bit.
1138 	 */
1139 	if (dir->tn_mode & VSGID && type == VDIR)
1140 		tp->tn_mode |= VSGID;
1141 	else {
1142 		if ((tp->tn_mode & VSGID) &&
1143 		    secpolicy_vnode_setids_setgids(cred, tp->tn_gid) != 0)
1144 			tp->tn_mode &= ~VSGID;
1145 	}
1146 
1147 	if (va->va_mask & AT_ATIME)
1148 		tp->tn_atime = va->va_atime;
1149 	if (va->va_mask & AT_MTIME)
1150 		tp->tn_mtime = va->va_mtime;
1151 
1152 	if (op == DE_MKDIR)
1153 		tdirinit(dir, tp);
1154 
1155 	*newnode = tp;
1156 	return (0);
1157 }
1158