/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2020 Joyent, Inc.
 * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
 * Copyright 2015-2023 RackTop Systems, Inc.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/atomic.h>
#include <sys/vm.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/kpm.h>
#include <vm/seg_kpm.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include "fs/fs_subr.h"
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/cred.h>
#include <sys/attr.h>
#include <sys/zil.h>
#include <sys/sa_impl.h>
#include <sys/zfs_project.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 *	can return EIO from the calling function.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4)	If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 *	dmu_tx_assign().  This is critical because we don't want to block
 *	while holding locks.
 *
 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 *	reduces lock contention and CPU usage when we must wait (note that if
 *	throughput is constrained by the storage, nearly every transaction
 *	must wait).
 *
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing
 *	to use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();	// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
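	 * For example (illustrative, assuming a 128K block size): for a
	 * 130000-byte file the DMU would report the trailing hole at 262144
	 * (the end of the last block); the clamp below moves it back to
	 * the logical EOF at 130000.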
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

static int
zfs_ioctl_getxattr(vnode_t *vp, intptr_t data, int flag, cred_t *cr,
    caller_context_t *ct)
{
	zfsxattr_t fsx = { 0 };
	znode_t *zp = VTOZ(vp);

	if (zp->z_pflags & ZFS_PROJINHERIT)
		fsx.fsx_xflags = ZFS_PROJINHERIT_FL;
	if (zp->z_pflags & ZFS_PROJID)
		fsx.fsx_projid = zp->z_projid;
	if (ddi_copyout(&fsx, (void *)data, sizeof (fsx), flag))
		return (SET_ERROR(EFAULT));

	return (0);
}

static int zfs_setattr(vnode_t *, vattr_t *, int, cred_t *, caller_context_t *);

static int
zfs_ioctl_setxattr(vnode_t *vp, intptr_t data, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsxattr_t fsx;
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;

	if (ddi_copyin((void *)data, &fsx, sizeof (fsx), flags))
		return (SET_ERROR(EFAULT));

	if (!zpl_is_valid_projid(fsx.fsx_projid))
		return (SET_ERROR(EINVAL));

	if (fsx.fsx_xflags & ~ZFS_PROJINHERIT_FL)
		return (SET_ERROR(EOPNOTSUPP));

	xva_init(&xva);
	xoap = xva_getxoptattr(&xva);

	XVA_SET_REQ(&xva, XAT_PROJINHERIT);
	if (fsx.fsx_xflags & ZFS_PROJINHERIT_FL)
		xoap->xoa_projinherit = B_TRUE;

	XVA_SET_REQ(&xva, XAT_PROJID);
	xoap->xoa_projid = fsx.fsx_projid;

	return (zfs_setattr(vp, (vattr_t *)&xva, flags, cr, ct));
}

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (zfs_sync(vp->v_vfsp, 0, cred));

		/*
		 * The following two ioctls are used by bfu.  Faking them
		 * out is necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIODIRECTIO:
	{
		/*
		 * ZFS inherently provides the basic semantics for directio.
		 * This is the summary from the ZFS on Linux support for
		 * O_DIRECT, which is the common form of directio, and required
		 * no changes to ZFS.
		 *
		 * 1. Minimize cache effects of the I/O.
		 *
		 *    By design the ARC is already scan-resistant, which helps
		 *    mitigate the need for special O_DIRECT handling.
		 *
		 * 2. O_DIRECT _MAY_ impose restrictions on IO alignment and
		 *    length.
		 *
		 *    No additional alignment or length restrictions are
		 *    imposed by ZFS.
		 *
		 * 3. O_DIRECT _MAY_ perform unbuffered IO operations directly
		 *    between user memory and block device.
		 *
		 *    No unbuffered IO operations are currently supported. In
		 *    order to support features such as compression, encryption,
		 *    and checksumming a copy must be made to transform the
		 *    data.
		 *
		 * 4. O_DIRECT _MAY_ imply O_DSYNC (XFS).
		 *
		 *    O_DIRECT does not imply O_DSYNC for ZFS.
		 *
		 * 5. O_DIRECT _MAY_ disable file locking that serializes IO
		 *    operations.
		 *
		 *    All I/O in ZFS is locked for correctness and this locking
		 *    is not disabled by O_DIRECT.
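		 *
		 * In short, the _FIODIRECTIO request is accepted for
		 * compatibility but changes no behavior; it simply succeeds.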
		 */
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case ZFS_IOC_FSGETXATTR:
		return (zfs_ioctl_getxattr(vp, data, flag, cred, ct));
	case ZFS_IOC_FSSETXATTR:
		return (zfs_ioctl_setxattr(vp, data, flag, cred, ct));
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Utility functions to map and unmap a single physical page.  These
 * are used to manage the mappable copies of ZFS file data, and therefore
 * do not update ref/mod bits.
 */
caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
	if (kpm_enable)
		return (hat_kpm_mapin(pp, 0));
	ASSERT(rw == S_READ || rw == S_WRITE);
	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
	    (caddr_t)-1));
}

void
zfs_unmap_page(page_t *pp, caddr_t addr)
{
	if (kpm_enable) {
		hat_kpm_mapout(pp, 0, addr);
	} else {
		ppmapout(addr);
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{
	int64_t	off;

	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t nbytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_WRITE);
			(void) dmu_read(os, oid, start+off, nbytes, va+off,
			    DMU_READ_PREFETCH);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		}
		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		otherwise we fall back to the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
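 *
 * The loop below walks the requested range one page at a time: data is
 * copied from the page cache when a page is present, and read through the
 * DMU otherwise.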
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_READ);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	boolean_t	frsync = B_FALSE;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

#ifdef FRSYNC
	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 * Only do this for non-snapshots.
	 *
	 * Some platforms do not support FRSYNC and instead map it
	 * to FSYNC, which results in unnecessary calls to zil_commit. We
	 * only honor FRSYNC requests on platforms which support it.
	 */
	frsync = !!(ioflag & FRSYNC);
#endif

	if (zfsvfs->z_log &&
	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
	    uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
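	 * (The ZFS_ACCESSTIME_STAMP() call at the end of this function still
	 * runs in that case.)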
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	rangelock_exit(lr);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

static void
zfs_write_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp,
    cred_t *cr, boolean_t *did_check, dmu_tx_t *tx)
{
	ASSERT(did_check != NULL);
	ASSERT(tx != NULL);

	if (*did_check)
		return;

	zilog_t *zilog = zfsvfs->z_log;

	/*
	 * Clear Set-UID/Set-GID bits on successful write if not
	 * privileged and at least one of the execute bits is set.
	 *
	 * It would be nice to do this after all writes have
	 * been done, but that would still expose the ISUID/ISGID
	 * to another app after the partial write is committed.
	 *
	 * Note: we don't call zfs_fuid_map_id() here because
	 * user 0 is not an ephemeral uid.
	 */
	mutex_enter(&zp->z_acl_lock);
	if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
	    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
	    secpolicy_vnode_setid_retain(cr,
	    ((zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0)) != 0) {
		uint64_t newmode;
		vattr_t va;

		zp->z_mode &= ~(S_ISUID | S_ISGID);
		newmode = zp->z_mode;
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
		    (void *)&newmode, sizeof (uint64_t), tx);

		/*
		 * Make sure SUID/SGID bits will be removed when we replay the
		 * log.
		 */
		bzero(&va, sizeof (va));
		va.va_mask = AT_MODE;
		va.va_nodeid = zp->z_id;
		va.va_mode = newmode;
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, AT_MODE, NULL);
	}
	mutex_exit(&zp->z_acl_lock);

	*did_check = B_TRUE;
}

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *			  set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	int		prev_error;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];
	boolean_t	did_clear_setid_bits = B_FALSE;

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots),
	 * our callers might not be able to detect properly that we are
	 * read-only, so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling rangelock_enter()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	locked_range_t *lr;
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
		woff = lr->lr_offset;
		if (lr->lr_length == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
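			 * (rangelock_reduce() in the write loop below shrinks
			 * the locked range once the new block size has been
			 * set.)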
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		rangelock_exit(lr);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		woff = uio->uio_loffset;

		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
		    zp->z_uid) ||
		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
		    zp->z_gid) ||
		    (zp->z_projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
		    zp->z_projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		arc_buf_t *abuf = NULL;
		if (xuio) {
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (n >= max_blksz && woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * NB: We must call zfs_write_clear_setid_bits_if_necessary
		 * before committing the transaction!
		 */

		/*
		 * If rangelock_enter() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since rangelock_reduce() will
		 * shrink down lr_length to the appropriate size.
		 */
		if (lr->lr_length == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
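				 * For example (illustrative): an existing 100K
				 * block would be allowed to grow to at most
				 * 128K here.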
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			rangelock_reduce(lr, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf_by_dbuf(
				    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff,
			    tx_bytes, zfsvfs->z_os, zp->z_id);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		zfs_write_clear_setid_bits_if_necessary(zfsvfs, zp, cr,
		    &did_clear_setid_bits, tx);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
		}
		/*
		 * If we are replaying and eof is non zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		/*
		 * Keep track of a possible pre-existing error from a partial
		 * write via dmu_write_uio_dbuf above.
		 */
		prev_error = error;
		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		/*
		 * NB: During replay, the TX_SETATTR record logged by
		 * zfs_write_clear_setid_bits_if_necessary must precede
		 * any of the TX_WRITE records logged here.
		 */
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (prev_error != 0 || error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
	}

	rangelock_exit(lr);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
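	 * (A short write still reports success; the caller can see how much
	 * remains unwritten via uio_resid.)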
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	rangelock_exit(zgd->zgd_lr);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) {	/* immediate write */
		zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
		    offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else {	/* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data.  We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
			    offset, size, RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			rangelock_exit(zgd->zgd_lr);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				/*
				 * TX_WRITE2 relies on the data previously
				 * written by the TX_WRITE that caused
				 * EALREADY.  We zero out the BP because
				 * it is the old, currently-on-disk BP.
				 */
				zgd->zgd_bp = NULL;
				BP_ZERO(bp);
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * If vnode is for a device return a specfs vnode instead.
 */
static int
specvp_check(vnode_t **vpp, cred_t *cr)
{
	int error = 0;

	if (IS_DEVVP(*vpp)) {
		struct vnode *svp;

		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (svp == NULL)
			error = SET_ERROR(ENOSYS);
		*vpp = svp;
	}
	return (error);
}


/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		direntflags - directory lookup flags
 *		realpnp - returned pathname.
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error = 0;
	boolean_t skipaclchk = ((flags & ATTR_NOACLCHECK) != 0);

	/*
	 * ATTR_NOACLCHECK is specified to skip EXECUTE checks for
	 * consumers (like SMB) that bypass traverse checking.
	 * Turn it off here so it can't accidentally be used
	 * for other checks.
	 */
	flags &= ~ATTR_NOACLCHECK;

	/*
	 * Fast path lookup; however, we must skip the DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed in name.  This means
	 * creating 'a' and removing 'A' on a case insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through fast path.
	 */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			error = zfs_fastaccesschk_execute(zdp, cr, skipaclchk);
			if (!error) {
				*vpp = dvp;
				VN_HOLD(*vpp);
				return (0);
			}
			return (error);
		} else if (!zdp->z_zfsvfs->z_norm &&
		    (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) {

			vnode_t *tvp = dnlc_lookup(dvp, nm);

			if (tvp) {
				error = zfs_fastaccesschk_execute(zdp, cr,
				    skipaclchk);
				if (error) {
					VN_RELE(tvp);
					return (error);
				}
				if (tvp == DNLC_NO_VNODE) {
					VN_RELE(tvp);
					return (SET_ERROR(ENOENT));
				} else {
					*vpp = tvp;
					return (specvp_check(vpp, cr));
				}
			}
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		/*
		 * We don't allow recursive attributes...
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    skipaclchk, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
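	 * (ACE_EXECUTE on a directory is the search/traverse permission.)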
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, skipaclchk, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0)
		error = specvp_check(vpp, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 *	IN:	dvp	- vnode of directory to put new file entry in.
 *		name	- name of new file entry.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- large file flag [UNUSED].
 *		ct	- caller context
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created or trunc'd entry.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	have_acl = B_FALSE;
	boolean_t	waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	*vpp = NULL;

	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;
		uint64_t projid = ZFS_DEFAULT_PROJID;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (vap->va_type == VREG || vap->va_type == VDIR)
			projid = zfs_inherit_projid(dzp);
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}
		error = dmu_tx_assign(tx,
		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		have_acl = B_FALSE;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		error = specvp_check(vpp, cr);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 *	IN:	dvp	- vnode of directory to remove entry from.
 *		name	- name of entry to remove.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

uint64_t null_xattr = 0;

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	uint64_t	xattr_obj_unlinked = 0;
	uint64_t	obj = 0;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	xattr_obj = 0;
	xzp = NULL;
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	mutex_enter(&vp->v_lock);
	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	obj = zp->z_id;
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (may_delete_now) {
		toobig =
		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	mutex_enter(&zp->z_lock);
	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
	mutex_exit(&zp->z_lock);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (xzp)
			VN_RELE(ZTOV(xzp));
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Hold z_lock so that we can make sure that the ACL obj
		 * hasn't changed.  Could have been deleted due to
		 * zfs_sa_upgrade().
		 */
		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
		    acl_obj;
		mutex_exit(&vp->v_lock);
	}

	if (delete_now) {
		if (xattr_obj_unlinked) {
			ASSERT3U(xzp->z_links, ==, 2);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_links = 0;
			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
			    &xzp->z_links, sizeof (xzp->z_links), tx);
			ASSERT3U(error, ==, 0);
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);

			if (zp->z_is_sa)
				error = sa_remove(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), tx);
			else
				error = sa_update(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
				    sizeof (uint64_t), tx);
			ASSERT0(error);
		}
		mutex_enter(&vp->v_lock);
		VN_RELE_LOCKED(vp);
		ASSERT0(vp->v_count);
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		mutex_exit(&zp->z_lock);
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now)
		VN_RELE(vp);
	if (xzp)
		VN_RELE(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 *	IN:	dvp	- vnode of directory to add subdir to.
 *		dirname	- name of new directory.
 *		vap	- attributes of new directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created directory.
 *
 *	RETURN:	0 on success, error code on failure.
2048 * 2049 * Timestamps: 2050 * dvp - ctime|mtime updated 2051 * vp - ctime|mtime|atime updated 2052 */ 2053 /*ARGSUSED*/ 2054 static int 2055 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, 2056 caller_context_t *ct, int flags, vsecattr_t *vsecp) 2057 { 2058 znode_t *zp, *dzp = VTOZ(dvp); 2059 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2060 zilog_t *zilog; 2061 zfs_dirlock_t *dl; 2062 uint64_t txtype; 2063 dmu_tx_t *tx; 2064 int error; 2065 int zf = ZNEW; 2066 ksid_t *ksid; 2067 uid_t uid; 2068 gid_t gid = crgetgid(cr); 2069 zfs_acl_ids_t acl_ids; 2070 boolean_t fuid_dirtied; 2071 boolean_t waited = B_FALSE; 2072 2073 ASSERT(vap->va_type == VDIR); 2074 2075 /* 2076 * If we have an ephemeral id, ACL, or XVATTR then 2077 * make sure file system is at proper version 2078 */ 2079 2080 ksid = crgetsid(cr, KSID_OWNER); 2081 if (ksid) 2082 uid = ksid_getid(ksid); 2083 else 2084 uid = crgetuid(cr); 2085 if (zfsvfs->z_use_fuids == B_FALSE && 2086 (vsecp || (vap->va_mask & AT_XVATTR) || 2087 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 2088 return (SET_ERROR(EINVAL)); 2089 2090 ZFS_ENTER(zfsvfs); 2091 ZFS_VERIFY_ZP(dzp); 2092 zilog = zfsvfs->z_log; 2093 2094 if (dzp->z_pflags & ZFS_XATTR) { 2095 ZFS_EXIT(zfsvfs); 2096 return (SET_ERROR(EINVAL)); 2097 } 2098 2099 if (zfsvfs->z_utf8 && u8_validate(dirname, 2100 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2101 ZFS_EXIT(zfsvfs); 2102 return (SET_ERROR(EILSEQ)); 2103 } 2104 if (flags & FIGNORECASE) 2105 zf |= ZCILOOK; 2106 2107 if (vap->va_mask & AT_XVATTR) { 2108 if ((error = secpolicy_xvattr((xvattr_t *)vap, 2109 crgetuid(cr), cr, vap->va_type)) != 0) { 2110 ZFS_EXIT(zfsvfs); 2111 return (error); 2112 } 2113 } 2114 2115 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 2116 vsecp, &acl_ids)) != 0) { 2117 ZFS_EXIT(zfsvfs); 2118 return (error); 2119 } 2120 /* 2121 * First make sure the new directory doesn't exist. 2122 * 2123 * Existence is checked first to make sure we don't return 2124 * EACCES instead of EEXIST which can cause some applications 2125 * to fail. 2126 */ 2127 top: 2128 *vpp = NULL; 2129 2130 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 2131 NULL, NULL)) { 2132 zfs_acl_ids_free(&acl_ids); 2133 ZFS_EXIT(zfsvfs); 2134 return (error); 2135 } 2136 2137 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 2138 zfs_acl_ids_free(&acl_ids); 2139 zfs_dirent_unlock(dl); 2140 ZFS_EXIT(zfsvfs); 2141 return (error); 2142 } 2143 2144 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { 2145 zfs_acl_ids_free(&acl_ids); 2146 zfs_dirent_unlock(dl); 2147 ZFS_EXIT(zfsvfs); 2148 return (SET_ERROR(EDQUOT)); 2149 } 2150 2151 /* 2152 * Add a new entry to the directory. 2153 */ 2154 tx = dmu_tx_create(zfsvfs->z_os); 2155 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 2156 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2157 fuid_dirtied = zfsvfs->z_fuid_dirty; 2158 if (fuid_dirtied) 2159 zfs_fuid_txhold(zfsvfs, tx); 2160 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2161 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 2162 acl_ids.z_aclp->z_acl_bytes); 2163 } 2164 2165 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 2166 ZFS_SA_BASE_ATTR_SIZE); 2167 2168 error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 2169 if (error) { 2170 zfs_dirent_unlock(dl); 2171 if (error == ERESTART) { 2172 waited = B_TRUE; 2173 dmu_tx_wait(tx); 2174 dmu_tx_abort(tx); 2175 goto top; 2176 } 2177 zfs_acl_ids_free(&acl_ids); 2178 dmu_tx_abort(tx); 2179 ZFS_EXIT(zfsvfs); 2180 return (error); 2181 } 2182 2183 /* 2184 * Create new node. 2185 */ 2186 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 2187 2188 if (fuid_dirtied) 2189 zfs_fuid_sync(zfsvfs, tx); 2190 2191 /* 2192 * Now put new name in parent dir. 2193 */ 2194 (void) zfs_link_create(dl, zp, tx, ZNEW); 2195 2196 *vpp = ZTOV(zp); 2197 2198 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 2199 if (flags & FIGNORECASE) 2200 txtype |= TX_CI; 2201 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 2202 acl_ids.z_fuidp, vap); 2203 2204 zfs_acl_ids_free(&acl_ids); 2205 2206 dmu_tx_commit(tx); 2207 2208 zfs_dirent_unlock(dl); 2209 2210 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2211 zil_commit(zilog, 0); 2212 2213 ZFS_EXIT(zfsvfs); 2214 return (0); 2215 } 2216 2217 /* 2218 * Remove a directory subdir entry. If the current working 2219 * directory is the same as the subdir to be removed, the 2220 * remove will fail. 2221 * 2222 * IN: dvp - vnode of directory to remove from. 2223 * name - name of directory to be removed. 2224 * cwd - vnode of current working directory. 2225 * cr - credentials of caller. 2226 * ct - caller context 2227 * flags - case flags 2228 * 2229 * RETURN: 0 on success, error code on failure. 2230 * 2231 * Timestamps: 2232 * dvp - ctime|mtime updated 2233 */ 2234 /*ARGSUSED*/ 2235 static int 2236 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, 2237 caller_context_t *ct, int flags) 2238 { 2239 znode_t *dzp = VTOZ(dvp); 2240 znode_t *zp; 2241 vnode_t *vp; 2242 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2243 zilog_t *zilog; 2244 zfs_dirlock_t *dl; 2245 dmu_tx_t *tx; 2246 int error; 2247 int zflg = ZEXISTS; 2248 boolean_t waited = B_FALSE; 2249 2250 ZFS_ENTER(zfsvfs); 2251 ZFS_VERIFY_ZP(dzp); 2252 zilog = zfsvfs->z_log; 2253 2254 if (flags & FIGNORECASE) 2255 zflg |= ZCILOOK; 2256 top: 2257 zp = NULL; 2258 2259 /* 2260 * Attempt to lock directory; fail if entry doesn't exist. 2261 */ 2262 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 2263 NULL, NULL)) { 2264 ZFS_EXIT(zfsvfs); 2265 return (error); 2266 } 2267 2268 vp = ZTOV(zp); 2269 2270 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 2271 goto out; 2272 } 2273 2274 if (vp->v_type != VDIR) { 2275 error = SET_ERROR(ENOTDIR); 2276 goto out; 2277 } 2278 2279 if (vp == cwd) { 2280 error = SET_ERROR(EINVAL); 2281 goto out; 2282 } 2283 2284 vnevent_rmdir(vp, dvp, name, ct); 2285 2286 /* 2287 * Grab a lock on the directory to make sure that noone is 2288 * trying to add (or lookup) entries while we are removing it. 2289 */ 2290 rw_enter(&zp->z_name_lock, RW_WRITER); 2291 2292 /* 2293 * Grab a lock on the parent pointer to make sure we play well 2294 * with the treewalk and directory rename code. 2295 */ 2296 rw_enter(&zp->z_parent_lock, RW_WRITER); 2297 2298 tx = dmu_tx_create(zfsvfs->z_os); 2299 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 2300 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2301 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2302 zfs_sa_upgrade_txholds(tx, zp); 2303 zfs_sa_upgrade_txholds(tx, dzp); 2304 dmu_tx_mark_netfree(tx); 2305 error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 2306 if (error) { 2307 rw_exit(&zp->z_parent_lock); 2308 rw_exit(&zp->z_name_lock); 2309 zfs_dirent_unlock(dl); 2310 VN_RELE(vp); 2311 if (error == ERESTART) { 2312 waited = B_TRUE; 2313 dmu_tx_wait(tx); 2314 dmu_tx_abort(tx); 2315 goto top; 2316 } 2317 dmu_tx_abort(tx); 2318 ZFS_EXIT(zfsvfs); 2319 return (error); 2320 } 2321 2322 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 2323 2324 if (error == 0) { 2325 uint64_t txtype = TX_RMDIR; 2326 if (flags & FIGNORECASE) 2327 txtype |= TX_CI; 2328 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, 2329 B_FALSE); 2330 } 2331 2332 dmu_tx_commit(tx); 2333 2334 rw_exit(&zp->z_parent_lock); 2335 rw_exit(&zp->z_name_lock); 2336 out: 2337 zfs_dirent_unlock(dl); 2338 2339 VN_RELE(vp); 2340 2341 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2342 zil_commit(zilog, 0); 2343 2344 ZFS_EXIT(zfsvfs); 2345 return (error); 2346 } 2347 2348 /* 2349 * Read as many directory entries as will fit into the provided 2350 * buffer from the given directory cursor position (specified in 2351 * the uio structure). 2352 * 2353 * IN: vp - vnode of directory to read. 2354 * uio - structure supplying read location, range info, 2355 * and return buffer. 2356 * cr - credentials of caller. 2357 * ct - caller context 2358 * flags - case flags 2359 * 2360 * OUT: uio - updated offset and range, buffer filled. 2361 * eofp - set to true if end-of-file detected. 2362 * 2363 * RETURN: 0 on success, error code on failure. 2364 * 2365 * Timestamps: 2366 * vp - atime updated 2367 * 2368 * Note that the low 4 bits of the cookie returned by zap is always zero. 2369 * This allows us to use the low range for "special" directory entries: 2370 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 2371 * we use the offset 2 for the '.zfs' directory. 2372 */ 2373 /* ARGSUSED */ 2374 static int 2375 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, 2376 caller_context_t *ct, int flags) 2377 { 2378 znode_t *zp = VTOZ(vp); 2379 iovec_t *iovp; 2380 edirent_t *eodp; 2381 dirent64_t *odp; 2382 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2383 objset_t *os; 2384 caddr_t outbuf; 2385 size_t bufsize; 2386 zap_cursor_t zc; 2387 zap_attribute_t zap; 2388 uint_t bytes_wanted; 2389 uint64_t offset; /* must be unsigned; checks for < 1 */ 2390 uint64_t parent; 2391 int local_eof; 2392 int outcount; 2393 int error; 2394 uint8_t prefetch; 2395 boolean_t check_sysattrs; 2396 2397 ZFS_ENTER(zfsvfs); 2398 ZFS_VERIFY_ZP(zp); 2399 2400 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 2401 &parent, sizeof (parent))) != 0) { 2402 ZFS_EXIT(zfsvfs); 2403 return (error); 2404 } 2405 2406 /* 2407 * If we are not given an eof variable, 2408 * use a local one. 2409 */ 2410 if (eofp == NULL) 2411 eofp = &local_eof; 2412 2413 /* 2414 * Check for valid iov_len. 2415 */ 2416 if (uio->uio_iov->iov_len <= 0) { 2417 ZFS_EXIT(zfsvfs); 2418 return (SET_ERROR(EINVAL)); 2419 } 2420 2421 /* 2422 * Quit if directory has been removed (posix) 2423 */ 2424 if ((*eofp = zp->z_unlinked) != 0) { 2425 ZFS_EXIT(zfsvfs); 2426 return (0); 2427 } 2428 2429 error = 0; 2430 os = zfsvfs->z_os; 2431 offset = uio->uio_loffset; 2432 prefetch = zp->z_zn_prefetch; 2433 2434 /* 2435 * Initialize the iterator cursor. 2436 */ 2437 if (offset <= 3) { 2438 /* 2439 * Start iteration from the beginning of the directory. 2440 */ 2441 zap_cursor_init(&zc, os, zp->z_id); 2442 } else { 2443 /* 2444 * The offset is a serialized cursor. 
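 * Offsets greater than 3 were produced by zap_cursor_serialize() on an earlier call, so they can be handed back to zap_cursor_init_serialized() to resume iteration where the previous call stopped.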
2445 */ 2446 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2447 } 2448 2449 /* 2450 * Get space to change directory entries into fs independent format. 2451 */ 2452 iovp = uio->uio_iov; 2453 bytes_wanted = iovp->iov_len; 2454 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2455 bufsize = bytes_wanted; 2456 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2457 odp = (struct dirent64 *)outbuf; 2458 } else { 2459 bufsize = bytes_wanted; 2460 outbuf = NULL; 2461 odp = (struct dirent64 *)iovp->iov_base; 2462 } 2463 eodp = (struct edirent *)odp; 2464 2465 /* 2466 * If this VFS supports the system attribute view interface; and 2467 * we're looking at an extended attribute directory; and we care 2468 * about normalization conflicts on this vfs; then we must check 2469 * for normalization conflicts with the sysattr name space. 2470 */ 2471 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2472 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2473 (flags & V_RDDIR_ENTFLAGS); 2474 2475 /* 2476 * Transform to file-system independent format 2477 */ 2478 outcount = 0; 2479 while (outcount < bytes_wanted) { 2480 ino64_t objnum; 2481 ushort_t reclen; 2482 off64_t *next = NULL; 2483 2484 /* 2485 * Special case `.', `..', and `.zfs'. 2486 */ 2487 if (offset == 0) { 2488 (void) strcpy(zap.za_name, "."); 2489 zap.za_normalization_conflict = 0; 2490 objnum = zp->z_id; 2491 } else if (offset == 1) { 2492 (void) strcpy(zap.za_name, ".."); 2493 zap.za_normalization_conflict = 0; 2494 objnum = parent; 2495 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2496 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2497 zap.za_normalization_conflict = 0; 2498 objnum = ZFSCTL_INO_ROOT; 2499 } else { 2500 /* 2501 * Grab next entry. 2502 */ 2503 if (error = zap_cursor_retrieve(&zc, &zap)) { 2504 if ((*eofp = (error == ENOENT)) != 0) 2505 break; 2506 else 2507 goto update; 2508 } 2509 2510 if (zap.za_integer_length != 8 || 2511 zap.za_num_integers != 1) { 2512 cmn_err(CE_WARN, "zap_readdir: bad directory " 2513 "entry, obj = %lld, offset = %lld\n", 2514 (u_longlong_t)zp->z_id, 2515 (u_longlong_t)offset); 2516 error = SET_ERROR(ENXIO); 2517 goto update; 2518 } 2519 2520 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2521 /* 2522 * MacOS X can extract the object type here such as: 2523 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2524 */ 2525 2526 if (check_sysattrs && !zap.za_normalization_conflict) { 2527 zap.za_normalization_conflict = 2528 xattr_sysattr_casechk(zap.za_name); 2529 } 2530 } 2531 2532 if (flags & V_RDDIR_ACCFILTER) { 2533 /* 2534 * If we have no access at all, don't include 2535 * this entry in the returned information 2536 */ 2537 znode_t *ezp; 2538 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) 2539 goto skip_entry; 2540 if (!zfs_has_access(ezp, cr)) { 2541 VN_RELE(ZTOV(ezp)); 2542 goto skip_entry; 2543 } 2544 VN_RELE(ZTOV(ezp)); 2545 } 2546 2547 if (flags & V_RDDIR_ENTFLAGS) 2548 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2549 else 2550 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2551 2552 /* 2553 * Will this entry fit in the buffer? 2554 */ 2555 if (outcount + reclen > bufsize) { 2556 /* 2557 * Did we manage to fit anything in the buffer? 
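 * If not even one entry fit, the caller's buffer is too small for the next entry and we return EINVAL; otherwise we stop here and return what has been copied so far.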
2558 */ 2559 if (!outcount) { 2560 error = SET_ERROR(EINVAL); 2561 goto update; 2562 } 2563 break; 2564 } 2565 if (flags & V_RDDIR_ENTFLAGS) { 2566 /* 2567 * Add extended flag entry: 2568 */ 2569 eodp->ed_ino = objnum; 2570 eodp->ed_reclen = reclen; 2571 /* NOTE: ed_off is the offset for the *next* entry */ 2572 next = &(eodp->ed_off); 2573 eodp->ed_eflags = zap.za_normalization_conflict ? 2574 ED_CASE_CONFLICT : 0; 2575 (void) strncpy(eodp->ed_name, zap.za_name, 2576 EDIRENT_NAMELEN(reclen)); 2577 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2578 } else { 2579 /* 2580 * Add normal entry: 2581 */ 2582 odp->d_ino = objnum; 2583 odp->d_reclen = reclen; 2584 /* NOTE: d_off is the offset for the *next* entry */ 2585 next = &(odp->d_off); 2586 (void) strncpy(odp->d_name, zap.za_name, 2587 DIRENT64_NAMELEN(reclen)); 2588 odp = (dirent64_t *)((intptr_t)odp + reclen); 2589 } 2590 outcount += reclen; 2591 2592 ASSERT(outcount <= bufsize); 2593 2594 /* Prefetch znode */ 2595 if (prefetch) 2596 dmu_prefetch(os, objnum, 0, 0, 0, 2597 ZIO_PRIORITY_SYNC_READ); 2598 2599 skip_entry: 2600 /* 2601 * Move to the next entry, fill in the previous offset. 2602 */ 2603 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2604 zap_cursor_advance(&zc); 2605 offset = zap_cursor_serialize(&zc); 2606 } else { 2607 offset += 1; 2608 } 2609 if (next) 2610 *next = offset; 2611 } 2612 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2613 2614 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2615 iovp->iov_base += outcount; 2616 iovp->iov_len -= outcount; 2617 uio->uio_resid -= outcount; 2618 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2619 /* 2620 * Reset the pointer. 2621 */ 2622 offset = uio->uio_loffset; 2623 } 2624 2625 update: 2626 zap_cursor_fini(&zc); 2627 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2628 kmem_free(outbuf, bufsize); 2629 2630 if (error == ENOENT) 2631 error = 0; 2632 2633 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2634 2635 uio->uio_loffset = offset; 2636 ZFS_EXIT(zfsvfs); 2637 return (error); 2638 } 2639 2640 ulong_t zfs_fsync_sync_cnt = 4; 2641 2642 static int 2643 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2644 { 2645 znode_t *zp = VTOZ(vp); 2646 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2647 2648 /* 2649 * Regardless of whether this is required for standards conformance, 2650 * this is the logical behavior when fsync() is called on a file with 2651 * dirty pages. We use B_ASYNC since the ZIL transactions are already 2652 * going to be pushed out as part of the zil_commit(). 2653 */ 2654 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 2655 (vp->v_type == VREG) && !(IS_SWAPVP(vp))) 2656 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct); 2657 2658 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2659 2660 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 2661 ZFS_ENTER(zfsvfs); 2662 ZFS_VERIFY_ZP(zp); 2663 zil_commit(zfsvfs->z_log, zp->z_id); 2664 ZFS_EXIT(zfsvfs); 2665 } 2666 return (0); 2667 } 2668 2669 2670 /* 2671 * Get the requested file attributes and place them in the provided 2672 * vattr structure. 2673 * 2674 * IN: vp - vnode of file. 2675 * vap - va_mask identifies requested attributes. 2676 * If AT_XVATTR set, then optional attrs are requested 2677 * flags - ATTR_NOACLCHECK (CIFS server context) 2678 * cr - credentials of caller. 2679 * ct - caller context 2680 * 2681 * OUT: vap - attribute values. 2682 * 2683 * RETURN: 0 (always succeeds). 
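 * (In practice an error from the SA lookup or from the access check below is returned to the caller as well.)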
2684 */ 2685 /* ARGSUSED */ 2686 static int 2687 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2688 caller_context_t *ct) 2689 { 2690 znode_t *zp = VTOZ(vp); 2691 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2692 int error = 0; 2693 uint64_t links; 2694 uint64_t mtime[2], ctime[2]; 2695 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2696 xoptattr_t *xoap = NULL; 2697 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2698 sa_bulk_attr_t bulk[2]; 2699 int count = 0; 2700 2701 ZFS_ENTER(zfsvfs); 2702 ZFS_VERIFY_ZP(zp); 2703 2704 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2705 2706 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2707 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2708 2709 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { 2710 ZFS_EXIT(zfsvfs); 2711 return (error); 2712 } 2713 2714 /* 2715 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2716 * Also, if we are the owner don't bother, since owner should 2717 * always be allowed to read basic attributes of file. 2718 */ 2719 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && 2720 (vap->va_uid != crgetuid(cr))) { 2721 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2722 skipaclchk, cr)) { 2723 ZFS_EXIT(zfsvfs); 2724 return (error); 2725 } 2726 } 2727 2728 /* 2729 * Return all attributes. It's cheaper to provide the answer 2730 * than to determine whether we were asked the question. 2731 */ 2732 2733 mutex_enter(&zp->z_lock); 2734 vap->va_type = vp->v_type; 2735 vap->va_mode = zp->z_mode & MODEMASK; 2736 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2737 vap->va_nodeid = zp->z_id; 2738 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2739 links = zp->z_links + 1; 2740 else 2741 links = zp->z_links; 2742 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ 2743 vap->va_size = zp->z_size; 2744 vap->va_rdev = vp->v_rdev; 2745 vap->va_seq = zp->z_seq; 2746 2747 /* 2748 * Add in any requested optional attributes and the create time. 2749 * Also set the corresponding bits in the returned attribute bitmap. 
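 * Optional attributes are filled in only when the caller passed an xvattr_t and the file system version supports them (zfsvfs->z_use_fuids).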
2750 */ 2751 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2752 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2753 xoap->xoa_archive = 2754 ((zp->z_pflags & ZFS_ARCHIVE) != 0); 2755 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2756 } 2757 2758 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2759 xoap->xoa_readonly = 2760 ((zp->z_pflags & ZFS_READONLY) != 0); 2761 XVA_SET_RTN(xvap, XAT_READONLY); 2762 } 2763 2764 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2765 xoap->xoa_system = 2766 ((zp->z_pflags & ZFS_SYSTEM) != 0); 2767 XVA_SET_RTN(xvap, XAT_SYSTEM); 2768 } 2769 2770 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2771 xoap->xoa_hidden = 2772 ((zp->z_pflags & ZFS_HIDDEN) != 0); 2773 XVA_SET_RTN(xvap, XAT_HIDDEN); 2774 } 2775 2776 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2777 xoap->xoa_nounlink = 2778 ((zp->z_pflags & ZFS_NOUNLINK) != 0); 2779 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2780 } 2781 2782 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2783 xoap->xoa_immutable = 2784 ((zp->z_pflags & ZFS_IMMUTABLE) != 0); 2785 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2786 } 2787 2788 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2789 xoap->xoa_appendonly = 2790 ((zp->z_pflags & ZFS_APPENDONLY) != 0); 2791 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2792 } 2793 2794 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2795 xoap->xoa_nodump = 2796 ((zp->z_pflags & ZFS_NODUMP) != 0); 2797 XVA_SET_RTN(xvap, XAT_NODUMP); 2798 } 2799 2800 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2801 xoap->xoa_opaque = 2802 ((zp->z_pflags & ZFS_OPAQUE) != 0); 2803 XVA_SET_RTN(xvap, XAT_OPAQUE); 2804 } 2805 2806 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2807 xoap->xoa_av_quarantined = 2808 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); 2809 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2810 } 2811 2812 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2813 xoap->xoa_av_modified = 2814 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); 2815 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 2816 } 2817 2818 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 2819 vp->v_type == VREG) { 2820 zfs_sa_get_scanstamp(zp, xvap); 2821 } 2822 2823 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 2824 uint64_t times[2]; 2825 2826 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), 2827 times, sizeof (times)); 2828 ZFS_TIME_DECODE(&xoap->xoa_createtime, times); 2829 XVA_SET_RTN(xvap, XAT_CREATETIME); 2830 } 2831 2832 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2833 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); 2834 XVA_SET_RTN(xvap, XAT_REPARSE); 2835 } 2836 if (XVA_ISSET_REQ(xvap, XAT_GEN)) { 2837 xoap->xoa_generation = zp->z_gen; 2838 XVA_SET_RTN(xvap, XAT_GEN); 2839 } 2840 2841 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 2842 xoap->xoa_offline = 2843 ((zp->z_pflags & ZFS_OFFLINE) != 0); 2844 XVA_SET_RTN(xvap, XAT_OFFLINE); 2845 } 2846 2847 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 2848 xoap->xoa_sparse = 2849 ((zp->z_pflags & ZFS_SPARSE) != 0); 2850 XVA_SET_RTN(xvap, XAT_SPARSE); 2851 } 2852 2853 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 2854 xoap->xoa_projinherit = 2855 ((zp->z_pflags & ZFS_PROJINHERIT) != 0); 2856 XVA_SET_RTN(xvap, XAT_PROJINHERIT); 2857 } 2858 2859 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 2860 xoap->xoa_projid = zp->z_projid; 2861 XVA_SET_RTN(xvap, XAT_PROJID); 2862 } 2863 } 2864 2865 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); 2866 ZFS_TIME_DECODE(&vap->va_mtime, mtime); 2867 ZFS_TIME_DECODE(&vap->va_ctime, ctime); 2868 2869 mutex_exit(&zp->z_lock); 2870 2871 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks); 2872 2873 if (zp->z_blksz == 0) { 2874 /* 2875 * Block size hasn't been set; suggest 
maximal I/O transfers. 2876 */ 2877 vap->va_blksize = zfsvfs->z_max_blksz; 2878 } 2879 2880 ZFS_EXIT(zfsvfs); 2881 return (0); 2882 } 2883 2884 /* 2885 * When changing a file's user, group, or project, we must update not 2886 * only the main object assigned to the file directly, but also the 2887 * objects that the file uses via its hidden xattr directory. 2888 * 2889 * Because the xattr directory may contain many EA entries, it may be 2890 * impossible to change all of them in the same transaction that changes 2891 * the main object's user/group/project attributes. In that case we 2892 * change them one by one in separate, independent transactions. This is 2893 * not ideal, but no better approach is available yet. 2894 */ 2895 static int 2896 zfs_setattr_dir(znode_t *dzp) 2897 { 2898 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2899 objset_t *os = zfsvfs->z_os; 2900 zap_cursor_t zc; 2901 zap_attribute_t zap; 2902 zfs_dirlock_t *dl; 2903 znode_t *zp = NULL; 2904 dmu_tx_t *tx = NULL; 2905 sa_bulk_attr_t bulk[4]; 2906 int count; 2907 int err; 2908 2909 zap_cursor_init(&zc, os, dzp->z_id); 2910 while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { 2911 count = 0; 2912 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { 2913 err = ENXIO; 2914 break; 2915 } 2916 2917 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, 2918 ZEXISTS, NULL, NULL); 2919 if (err == ENOENT) 2920 goto next; 2921 if (err) 2922 break; 2923 2924 if (zp->z_uid == dzp->z_uid && 2925 zp->z_gid == dzp->z_gid && 2926 zp->z_projid == dzp->z_projid) 2927 goto next; 2928 2929 tx = dmu_tx_create(os); 2930 if (!(zp->z_pflags & ZFS_PROJID)) 2931 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2932 else 2933 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2934 2935 err = dmu_tx_assign(tx, TXG_WAIT); 2936 if (err) 2937 break; 2938 2939 mutex_enter(&dzp->z_lock); 2940 2941 if (zp->z_uid != dzp->z_uid) { 2942 zp->z_uid = dzp->z_uid; 2943 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 2944 &dzp->z_uid, sizeof (dzp->z_uid)); 2945 } 2946 2947 if (zp->z_gid != dzp->z_gid) { 2948 zp->z_gid = dzp->z_gid; 2949 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 2950 &dzp->z_gid, sizeof (dzp->z_gid)); 2951 } 2952 2953 if (zp->z_projid != dzp->z_projid) { 2954 if (!(zp->z_pflags & ZFS_PROJID)) { 2955 zp->z_pflags |= ZFS_PROJID; 2956 SA_ADD_BULK_ATTR(bulk, count, 2957 SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 2958 sizeof (zp->z_pflags)); 2959 } 2960 2961 zp->z_projid = dzp->z_projid; 2962 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), 2963 NULL, &zp->z_projid, sizeof (zp->z_projid)); 2964 } 2965 2966 mutex_exit(&dzp->z_lock); 2967 2968 if (likely(count > 0)) { 2969 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2970 dmu_tx_commit(tx); 2971 } else { 2972 dmu_tx_abort(tx); 2973 } 2974 tx = NULL; 2975 if (err != 0 && err != ENOENT) 2976 break; 2977 2978 next: 2979 if (zp) { 2980 VN_RELE(ZTOV(zp)); 2981 zp = NULL; 2982 zfs_dirent_unlock(dl); 2983 } 2984 zap_cursor_advance(&zc); 2985 } 2986 2987 if (tx) 2988 dmu_tx_abort(tx); 2989 if (zp) { 2990 VN_RELE(ZTOV(zp)); 2991 zfs_dirent_unlock(dl); 2992 } 2993 zap_cursor_fini(&zc); 2994 2995 return (err == ENOENT ? 0 : err); 2996 } 2997 2998 /* 2999 * Set the file attributes to the values contained in the 3000 * vattr structure. 3001 * 3002 * IN: vp - vnode of file to be modified. 3003 * vap - new attribute values. 3004 * If AT_XVATTR set, then optional attrs are being set 3005 * flags - ATTR_UTIME set if non-default time values provided.
3006 * - ATTR_NOACLCHECK (CIFS context only). 3007 * cr - credentials of caller. 3008 * ct - caller context 3009 * 3010 * RETURN: 0 on success, error code on failure. 3011 * 3012 * Timestamps: 3013 * vp - ctime updated, mtime updated if size changed. 3014 */ 3015 /* ARGSUSED */ 3016 static int 3017 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 3018 caller_context_t *ct) 3019 { 3020 znode_t *zp = VTOZ(vp); 3021 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3022 objset_t *os = zfsvfs->z_os; 3023 zilog_t *zilog; 3024 dmu_tx_t *tx; 3025 vattr_t oldva; 3026 xvattr_t tmpxvattr; 3027 uint_t mask = vap->va_mask; 3028 uint_t saved_mask = 0; 3029 int trim_mask = 0; 3030 uint64_t new_mode; 3031 uint64_t new_uid, new_gid; 3032 uint64_t xattr_obj; 3033 uint64_t mtime[2], ctime[2]; 3034 uint64_t projid = ZFS_INVALID_PROJID; 3035 znode_t *attrzp; 3036 int need_policy = FALSE; 3037 int err, err2 = 0; 3038 zfs_fuid_info_t *fuidp = NULL; 3039 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 3040 xoptattr_t *xoap; 3041 zfs_acl_t *aclp; 3042 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 3043 boolean_t fuid_dirtied = B_FALSE; 3044 boolean_t handle_eadir = B_FALSE; 3045 sa_bulk_attr_t bulk[8], xattr_bulk[8]; 3046 int count = 0, xattr_count = 0; 3047 3048 if (mask == 0) 3049 return (0); 3050 3051 if (mask & AT_NOSET) 3052 return (SET_ERROR(EINVAL)); 3053 3054 ZFS_ENTER(zfsvfs); 3055 ZFS_VERIFY_ZP(zp); 3056 3057 /* 3058 * If this is a xvattr_t, then get a pointer to the structure of 3059 * optional attributes. If this is NULL, then we have a vattr_t. 3060 */ 3061 xoap = xva_getxoptattr(xvap); 3062 if (xoap != NULL && (mask & AT_XVATTR)) { 3063 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 3064 if (!dmu_objset_projectquota_enabled(os) || 3065 (vp->v_type != VREG && vp->v_type != VDIR)) { 3066 ZFS_EXIT(zfsvfs); 3067 return (SET_ERROR(ENOTSUP)); 3068 } 3069 3070 projid = xoap->xoa_projid; 3071 if (unlikely(projid == ZFS_INVALID_PROJID)) { 3072 ZFS_EXIT(zfsvfs); 3073 return (SET_ERROR(EINVAL)); 3074 } 3075 3076 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) 3077 projid = ZFS_INVALID_PROJID; 3078 else 3079 need_policy = TRUE; 3080 } 3081 3082 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && 3083 (!dmu_objset_projectquota_enabled(os) || 3084 (vp->v_type != VREG && vp->v_type != VDIR))) { 3085 ZFS_EXIT(zfsvfs); 3086 return (SET_ERROR(ENOTSUP)); 3087 } 3088 } 3089 3090 zilog = zfsvfs->z_log; 3091 3092 /* 3093 * Make sure that if we have ephemeral uid/gid or xvattr specified 3094 * that file system is at proper version level 3095 */ 3096 3097 if (zfsvfs->z_use_fuids == B_FALSE && 3098 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 3099 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 3100 (mask & AT_XVATTR))) { 3101 ZFS_EXIT(zfsvfs); 3102 return (SET_ERROR(EINVAL)); 3103 } 3104 3105 if (mask & AT_SIZE && vp->v_type == VDIR) { 3106 ZFS_EXIT(zfsvfs); 3107 return (SET_ERROR(EISDIR)); 3108 } 3109 3110 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 3111 ZFS_EXIT(zfsvfs); 3112 return (SET_ERROR(EINVAL)); 3113 } 3114 3115 xva_init(&tmpxvattr); 3116 3117 /* 3118 * Immutable files can only alter immutable bit and atime 3119 */ 3120 if ((zp->z_pflags & ZFS_IMMUTABLE) && 3121 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 3122 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 3123 ZFS_EXIT(zfsvfs); 3124 return (SET_ERROR(EPERM)); 3125 } 3126 3127 /* 3128 * Note: ZFS_READONLY is handled in zfs_zaccess_common. 
3129 */ 3130 3131 /* 3132 * Verify timestamps doesn't overflow 32 bits. 3133 * ZFS can handle large timestamps, but 32bit syscalls can't 3134 * handle times greater than 2039. This check should be removed 3135 * once large timestamps are fully supported. 3136 */ 3137 if (mask & (AT_ATIME | AT_MTIME)) { 3138 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 3139 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 3140 ZFS_EXIT(zfsvfs); 3141 return (SET_ERROR(EOVERFLOW)); 3142 } 3143 } 3144 3145 top: 3146 attrzp = NULL; 3147 aclp = NULL; 3148 3149 /* Can this be moved to before the top label? */ 3150 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 3151 ZFS_EXIT(zfsvfs); 3152 return (SET_ERROR(EROFS)); 3153 } 3154 3155 /* 3156 * First validate permissions 3157 */ 3158 3159 if (mask & AT_SIZE) { 3160 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); 3161 if (err) { 3162 ZFS_EXIT(zfsvfs); 3163 return (err); 3164 } 3165 /* 3166 * XXX - Note, we are not providing any open 3167 * mode flags here (like FNDELAY), so we may 3168 * block if there are locks present... this 3169 * should be addressed in openat(). 3170 */ 3171 /* XXX - would it be OK to generate a log record here? */ 3172 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 3173 if (err) { 3174 ZFS_EXIT(zfsvfs); 3175 return (err); 3176 } 3177 3178 if (vap->va_size == 0) 3179 vnevent_truncate(ZTOV(zp), ct); 3180 } 3181 3182 if (mask & (AT_ATIME|AT_MTIME) || 3183 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 3184 XVA_ISSET_REQ(xvap, XAT_READONLY) || 3185 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 3186 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 3187 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 3188 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 3189 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 3190 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 3191 skipaclchk, cr); 3192 } 3193 3194 if (mask & (AT_UID|AT_GID)) { 3195 int idmask = (mask & (AT_UID|AT_GID)); 3196 int take_owner; 3197 int take_group; 3198 3199 /* 3200 * NOTE: even if a new mode is being set, 3201 * we may clear S_ISUID/S_ISGID bits. 3202 */ 3203 3204 if (!(mask & AT_MODE)) 3205 vap->va_mode = zp->z_mode; 3206 3207 /* 3208 * Take ownership or chgrp to group we are a member of 3209 */ 3210 3211 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 3212 take_group = (mask & AT_GID) && 3213 zfs_groupmember(zfsvfs, vap->va_gid, cr); 3214 3215 /* 3216 * If both AT_UID and AT_GID are set then take_owner and 3217 * take_group must both be set in order to allow taking 3218 * ownership. 3219 * 3220 * Otherwise, send the check through secpolicy_vnode_setattr() 3221 * 3222 */ 3223 3224 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 3225 ((idmask == AT_UID) && take_owner) || 3226 ((idmask == AT_GID) && take_group)) { 3227 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 3228 skipaclchk, cr) == 0) { 3229 /* 3230 * Remove setuid/setgid for non-privileged users 3231 */ 3232 secpolicy_setid_clear(vap, cr); 3233 trim_mask = (mask & (AT_UID|AT_GID)); 3234 } else { 3235 need_policy = TRUE; 3236 } 3237 } else { 3238 need_policy = TRUE; 3239 } 3240 } 3241 3242 mutex_enter(&zp->z_lock); 3243 oldva.va_mode = zp->z_mode; 3244 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 3245 if (mask & AT_XVATTR) { 3246 /* 3247 * Update xvattr mask to include only those attributes 3248 * that are actually changing. 3249 * 3250 * the bits will be restored prior to actually setting 3251 * the attributes so the caller thinks they were set. 
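 * Each XAT_* bit whose requested value already matches the on-disk flag is cleared from xvap and remembered in tmpxvattr; any bit that would actually change requires a policy check.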
3252 */ 3253 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 3254 if (xoap->xoa_appendonly != 3255 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 3256 need_policy = TRUE; 3257 } else { 3258 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 3259 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 3260 } 3261 } 3262 3263 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 3264 if (xoap->xoa_projinherit != 3265 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { 3266 need_policy = TRUE; 3267 } else { 3268 XVA_CLR_REQ(xvap, XAT_PROJINHERIT); 3269 XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT); 3270 } 3271 } 3272 3273 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 3274 if (xoap->xoa_nounlink != 3275 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 3276 need_policy = TRUE; 3277 } else { 3278 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 3279 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 3280 } 3281 } 3282 3283 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 3284 if (xoap->xoa_immutable != 3285 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 3286 need_policy = TRUE; 3287 } else { 3288 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 3289 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 3290 } 3291 } 3292 3293 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 3294 if (xoap->xoa_nodump != 3295 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 3296 need_policy = TRUE; 3297 } else { 3298 XVA_CLR_REQ(xvap, XAT_NODUMP); 3299 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); 3300 } 3301 } 3302 3303 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 3304 if (xoap->xoa_av_modified != 3305 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 3306 need_policy = TRUE; 3307 } else { 3308 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 3309 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 3310 } 3311 } 3312 3313 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 3314 if ((vp->v_type != VREG && 3315 xoap->xoa_av_quarantined) || 3316 xoap->xoa_av_quarantined != 3317 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 3318 need_policy = TRUE; 3319 } else { 3320 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 3321 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 3322 } 3323 } 3324 3325 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 3326 mutex_exit(&zp->z_lock); 3327 ZFS_EXIT(zfsvfs); 3328 return (SET_ERROR(EPERM)); 3329 } 3330 3331 if (need_policy == FALSE && 3332 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 3333 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 3334 need_policy = TRUE; 3335 } 3336 } 3337 3338 mutex_exit(&zp->z_lock); 3339 3340 if (mask & AT_MODE) { 3341 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 3342 err = secpolicy_setid_setsticky_clear(vp, vap, 3343 &oldva, cr); 3344 if (err) { 3345 ZFS_EXIT(zfsvfs); 3346 return (err); 3347 } 3348 trim_mask |= AT_MODE; 3349 } else { 3350 need_policy = TRUE; 3351 } 3352 } 3353 3354 if (need_policy) { 3355 /* 3356 * If trim_mask is set then take ownership 3357 * has been granted or write_acl is present and user 3358 * has the ability to modify mode. In that case remove 3359 * UID|GID and or MODE from mask so that 3360 * secpolicy_vnode_setattr() doesn't revoke it. 3361 * If acl_implicit (implicit owner rights) is false, 3362 * tell secpolicy about that via the flags. 
3363 */ 3364 3365 if (zfsvfs->z_acl_implicit == B_FALSE) 3366 flags |= ATTR_NOIMPLICIT; 3367 if (trim_mask) { 3368 saved_mask = vap->va_mask; 3369 vap->va_mask &= ~trim_mask; 3370 } 3371 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 3372 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 3373 if (err) { 3374 ZFS_EXIT(zfsvfs); 3375 return (err); 3376 } 3377 3378 if (trim_mask) 3379 vap->va_mask |= saved_mask; 3380 } 3381 3382 /* 3383 * secpolicy_vnode_setattr, or take ownership may have 3384 * changed va_mask 3385 */ 3386 mask = vap->va_mask; 3387 3388 if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) { 3389 handle_eadir = B_TRUE; 3390 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 3391 &xattr_obj, sizeof (xattr_obj)); 3392 3393 if (err == 0 && xattr_obj) { 3394 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); 3395 if (err) 3396 goto out2; 3397 } 3398 if (mask & AT_UID) { 3399 new_uid = zfs_fuid_create(zfsvfs, 3400 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 3401 if (new_uid != zp->z_uid && 3402 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, 3403 new_uid)) { 3404 if (attrzp) 3405 VN_RELE(ZTOV(attrzp)); 3406 err = SET_ERROR(EDQUOT); 3407 goto out2; 3408 } 3409 } 3410 3411 if (mask & AT_GID) { 3412 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 3413 cr, ZFS_GROUP, &fuidp); 3414 if (new_gid != zp->z_gid && 3415 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, 3416 new_gid)) { 3417 if (attrzp) 3418 VN_RELE(ZTOV(attrzp)); 3419 err = SET_ERROR(EDQUOT); 3420 goto out2; 3421 } 3422 } 3423 3424 if (projid != ZFS_INVALID_PROJID && 3425 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { 3426 if (attrzp) 3427 VN_RELE(ZTOV(attrzp)); 3428 err = EDQUOT; 3429 goto out2; 3430 } 3431 } 3432 tx = dmu_tx_create(os); 3433 3434 if (mask & AT_MODE) { 3435 uint64_t pmode = zp->z_mode; 3436 uint64_t acl_obj; 3437 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 3438 3439 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && 3440 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 3441 err = SET_ERROR(EPERM); 3442 goto out; 3443 } 3444 3445 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) 3446 goto out; 3447 3448 mutex_enter(&zp->z_lock); 3449 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 3450 /* 3451 * Are we upgrading ACL from old V0 format 3452 * to V1 format? 
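 * If so, hold the tx for freeing the old external ACL object and writing a new one; otherwise hold it for rewriting the existing ACL object in place.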
3453 */ 3454 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 3455 zfs_znode_acl_version(zp) == 3456 ZFS_ACL_VERSION_INITIAL) { 3457 dmu_tx_hold_free(tx, acl_obj, 0, 3458 DMU_OBJECT_END); 3459 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3460 0, aclp->z_acl_bytes); 3461 } else { 3462 dmu_tx_hold_write(tx, acl_obj, 0, 3463 aclp->z_acl_bytes); 3464 } 3465 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3466 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3467 0, aclp->z_acl_bytes); 3468 } 3469 mutex_exit(&zp->z_lock); 3470 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3471 } else { 3472 if (((mask & AT_XVATTR) && 3473 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 3474 (projid != ZFS_INVALID_PROJID && 3475 !(zp->z_pflags & ZFS_PROJID))) 3476 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3477 else 3478 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3479 } 3480 3481 if (attrzp) { 3482 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 3483 } 3484 3485 fuid_dirtied = zfsvfs->z_fuid_dirty; 3486 if (fuid_dirtied) 3487 zfs_fuid_txhold(zfsvfs, tx); 3488 3489 zfs_sa_upgrade_txholds(tx, zp); 3490 3491 err = dmu_tx_assign(tx, TXG_WAIT); 3492 if (err) 3493 goto out; 3494 3495 count = 0; 3496 /* 3497 * Set each attribute requested. 3498 * We group settings according to the locks they need to acquire. 3499 * 3500 * Note: you cannot set ctime directly, although it will be 3501 * updated as a side-effect of calling this function. 3502 */ 3503 3504 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { 3505 /* 3506 * An existing object created before project IDs were supported 3507 * has no slot in its on-disk layout for the project ID attribute, 3508 * but the quota accounting logic must access that slot directly by 3509 * offset. Adjust the old object's layout so the project ID lives 3510 * at a unified, fixed offset.
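 * sa_add_projid() performs that adjustment; EEXIST from it simply means the slot is already present and is treated as success.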
3511 */ 3512 if (attrzp) 3513 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); 3514 if (err == 0) 3515 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 3516 3517 if (unlikely(err == EEXIST)) 3518 err = 0; 3519 else if (err != 0) 3520 goto out; 3521 else 3522 projid = ZFS_INVALID_PROJID; 3523 } 3524 3525 if (mask & (AT_UID|AT_GID|AT_MODE)) 3526 mutex_enter(&zp->z_acl_lock); 3527 mutex_enter(&zp->z_lock); 3528 3529 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 3530 &zp->z_pflags, sizeof (zp->z_pflags)); 3531 3532 if (attrzp) { 3533 if (mask & (AT_UID|AT_GID|AT_MODE)) 3534 mutex_enter(&attrzp->z_acl_lock); 3535 mutex_enter(&attrzp->z_lock); 3536 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3537 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 3538 sizeof (attrzp->z_pflags)); 3539 if (projid != ZFS_INVALID_PROJID) { 3540 attrzp->z_projid = projid; 3541 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3542 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, 3543 sizeof (attrzp->z_projid)); 3544 } 3545 } 3546 3547 if (mask & (AT_UID|AT_GID)) { 3548 3549 if (mask & AT_UID) { 3550 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 3551 &new_uid, sizeof (new_uid)); 3552 zp->z_uid = new_uid; 3553 if (attrzp) { 3554 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3555 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 3556 sizeof (new_uid)); 3557 attrzp->z_uid = new_uid; 3558 } 3559 } 3560 3561 if (mask & AT_GID) { 3562 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 3563 NULL, &new_gid, sizeof (new_gid)); 3564 zp->z_gid = new_gid; 3565 if (attrzp) { 3566 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3567 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 3568 sizeof (new_gid)); 3569 attrzp->z_gid = new_gid; 3570 } 3571 } 3572 if (!(mask & AT_MODE)) { 3573 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 3574 NULL, &new_mode, sizeof (new_mode)); 3575 new_mode = zp->z_mode; 3576 } 3577 err = zfs_acl_chown_setattr(zp); 3578 ASSERT(err == 0); 3579 if (attrzp) { 3580 err = zfs_acl_chown_setattr(attrzp); 3581 ASSERT(err == 0); 3582 } 3583 } 3584 3585 if (mask & AT_MODE) { 3586 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 3587 &new_mode, sizeof (new_mode)); 3588 zp->z_mode = new_mode; 3589 ASSERT3U((uintptr_t)aclp, !=, NULL); 3590 err = zfs_aclset_common(zp, aclp, cr, tx); 3591 ASSERT0(err); 3592 if (zp->z_acl_cached) 3593 zfs_acl_free(zp->z_acl_cached); 3594 zp->z_acl_cached = aclp; 3595 aclp = NULL; 3596 } 3597 3598 3599 if (mask & AT_ATIME) { 3600 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); 3601 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 3602 &zp->z_atime, sizeof (zp->z_atime)); 3603 } 3604 3605 if (mask & AT_MTIME) { 3606 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 3607 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 3608 mtime, sizeof (mtime)); 3609 } 3610 3611 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? 
*/ 3612 if (mask & AT_SIZE && !(mask & AT_MTIME)) { 3613 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 3614 NULL, mtime, sizeof (mtime)); 3615 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3616 &ctime, sizeof (ctime)); 3617 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 3618 B_TRUE); 3619 } else if (mask != 0) { 3620 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3621 &ctime, sizeof (ctime)); 3622 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, 3623 B_TRUE); 3624 if (attrzp) { 3625 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3626 SA_ZPL_CTIME(zfsvfs), NULL, 3627 &ctime, sizeof (ctime)); 3628 zfs_tstamp_update_setup(attrzp, STATE_CHANGED, 3629 mtime, ctime, B_TRUE); 3630 } 3631 } 3632 3633 if (projid != ZFS_INVALID_PROJID) { 3634 zp->z_projid = projid; 3635 SA_ADD_BULK_ATTR(bulk, count, 3636 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 3637 sizeof (zp->z_projid)); 3638 } 3639 3640 /* 3641 * Do this after setting timestamps to prevent timestamp 3642 * update from toggling bit 3643 */ 3644 3645 if (xoap && (mask & AT_XVATTR)) { 3646 3647 /* 3648 * restore trimmed off masks 3649 * so that return masks can be set for caller. 3650 */ 3651 3652 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 3653 XVA_SET_REQ(xvap, XAT_APPENDONLY); 3654 } 3655 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 3656 XVA_SET_REQ(xvap, XAT_NOUNLINK); 3657 } 3658 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 3659 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 3660 } 3661 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 3662 XVA_SET_REQ(xvap, XAT_NODUMP); 3663 } 3664 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 3665 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 3666 } 3667 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 3668 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 3669 } 3670 if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) { 3671 XVA_SET_REQ(xvap, XAT_PROJINHERIT); 3672 } 3673 3674 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3675 ASSERT(vp->v_type == VREG); 3676 3677 zfs_xvattr_set(zp, xvap, tx); 3678 } 3679 3680 if (fuid_dirtied) 3681 zfs_fuid_sync(zfsvfs, tx); 3682 3683 if (mask != 0) 3684 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 3685 3686 mutex_exit(&zp->z_lock); 3687 if (mask & (AT_UID|AT_GID|AT_MODE)) 3688 mutex_exit(&zp->z_acl_lock); 3689 3690 if (attrzp) { 3691 if (mask & (AT_UID|AT_GID|AT_MODE)) 3692 mutex_exit(&attrzp->z_acl_lock); 3693 mutex_exit(&attrzp->z_lock); 3694 } 3695 out: 3696 if (err == 0 && xattr_count > 0) { 3697 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 3698 xattr_count, tx); 3699 ASSERT(err2 == 0); 3700 } 3701 3702 if (aclp) 3703 zfs_acl_free(aclp); 3704 3705 if (fuidp) { 3706 zfs_fuid_info_free(fuidp); 3707 fuidp = NULL; 3708 } 3709 3710 if (err) { 3711 dmu_tx_abort(tx); 3712 if (attrzp) 3713 VN_RELE(ZTOV(attrzp)); 3714 if (err == ERESTART) 3715 goto top; 3716 } else { 3717 if (count > 0) 3718 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 3719 dmu_tx_commit(tx); 3720 if (attrzp) { 3721 if (err2 == 0 && handle_eadir) 3722 err2 = zfs_setattr_dir(attrzp); 3723 VN_RELE(ZTOV(attrzp)); 3724 } 3725 } 3726 3727 out2: 3728 if (os->os_sync == ZFS_SYNC_ALWAYS) 3729 zil_commit(zilog, 0); 3730 3731 ZFS_EXIT(zfsvfs); 3732 return (err); 3733 } 3734 3735 typedef struct zfs_zlock { 3736 krwlock_t *zl_rwlock; /* lock we acquired */ 3737 znode_t *zl_znode; /* znode we held */ 3738 struct zfs_zlock *zl_next; /* next in list */ 3739 } zfs_zlock_t; 3740 3741 /* 3742 * Drop locks and release vnodes that were held by zfs_rename_lock(). 
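 * The list is a stack: an entry was pushed as each lock was taken, so the locks are released in the reverse order of acquisition.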
3743 */ 3744 static void 3745 zfs_rename_unlock(zfs_zlock_t **zlpp) 3746 { 3747 zfs_zlock_t *zl; 3748 3749 while ((zl = *zlpp) != NULL) { 3750 if (zl->zl_znode != NULL) 3751 VN_RELE(ZTOV(zl->zl_znode)); 3752 rw_exit(zl->zl_rwlock); 3753 *zlpp = zl->zl_next; 3754 kmem_free(zl, sizeof (*zl)); 3755 } 3756 } 3757 3758 /* 3759 * Search back through the directory tree, using the ".." entries. 3760 * Lock each directory in the chain to prevent concurrent renames. 3761 * Fail any attempt to move a directory into one of its own descendants. 3762 * XXX - z_parent_lock can overlap with map or grow locks 3763 */ 3764 static int 3765 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 3766 { 3767 zfs_zlock_t *zl; 3768 znode_t *zp = tdzp; 3769 uint64_t rootid = zp->z_zfsvfs->z_root; 3770 uint64_t oidp = zp->z_id; 3771 krwlock_t *rwlp = &szp->z_parent_lock; 3772 krw_t rw = RW_WRITER; 3773 3774 /* 3775 * First pass write-locks szp and compares to zp->z_id. 3776 * Later passes read-lock zp and compare to zp->z_parent. 3777 */ 3778 do { 3779 if (!rw_tryenter(rwlp, rw)) { 3780 /* 3781 * Another thread is renaming in this path. 3782 * Note that if we are a WRITER, we don't have any 3783 * parent_locks held yet. 3784 */ 3785 if (rw == RW_READER && zp->z_id > szp->z_id) { 3786 /* 3787 * Drop our locks and restart 3788 */ 3789 zfs_rename_unlock(&zl); 3790 *zlpp = NULL; 3791 zp = tdzp; 3792 oidp = zp->z_id; 3793 rwlp = &szp->z_parent_lock; 3794 rw = RW_WRITER; 3795 continue; 3796 } else { 3797 /* 3798 * Wait for other thread to drop its locks 3799 */ 3800 rw_enter(rwlp, rw); 3801 } 3802 } 3803 3804 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 3805 zl->zl_rwlock = rwlp; 3806 zl->zl_znode = NULL; 3807 zl->zl_next = *zlpp; 3808 *zlpp = zl; 3809 3810 if (oidp == szp->z_id) /* We're a descendant of szp */ 3811 return (SET_ERROR(EINVAL)); 3812 3813 if (oidp == rootid) /* We've hit the top */ 3814 return (0); 3815 3816 if (rw == RW_READER) { /* i.e. not the first pass */ 3817 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); 3818 if (error) 3819 return (error); 3820 zl->zl_znode = zp; 3821 } 3822 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), 3823 &oidp, sizeof (oidp)); 3824 rwlp = &zp->z_parent_lock; 3825 rw = RW_READER; 3826 3827 } while (zp->z_id != sdzp->z_id); 3828 3829 return (0); 3830 } 3831 3832 /* 3833 * Move an entry from the provided source directory to the target 3834 * directory. Change the entry name as indicated. 3835 * 3836 * IN: sdvp - Source directory containing the "old entry". 3837 * snm - Old entry name. 3838 * tdvp - Target directory to contain the "new entry". 3839 * tnm - New entry name. 3840 * cr - credentials of caller. 3841 * ct - caller context 3842 * flags - case flags 3843 * 3844 * RETURN: 0 on success, error code on failure. 3845 * 3846 * Timestamps: 3847 * sdvp,tdvp - ctime|mtime updated 3848 */ 3849 /*ARGSUSED*/ 3850 static int 3851 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, 3852 caller_context_t *ct, int flags) 3853 { 3854 znode_t *tdzp, *szp, *tzp; 3855 znode_t *sdzp = VTOZ(sdvp); 3856 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; 3857 zilog_t *zilog; 3858 vnode_t *realvp; 3859 zfs_dirlock_t *sdl, *tdl; 3860 dmu_tx_t *tx; 3861 zfs_zlock_t *zl; 3862 int cmp, serr, terr; 3863 int error = 0, rm_err = 0; 3864 int zflg = 0; 3865 boolean_t waited = B_FALSE; 3866 3867 ZFS_ENTER(zfsvfs); 3868 ZFS_VERIFY_ZP(sdzp); 3869 zilog = zfsvfs->z_log; 3870 3871 /* 3872 * Make sure we have the real vp for the target directory. 
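 * (VOP_REALVP() strips away any stacking file system, e.g. lofs, so that we operate on the underlying ZFS vnode.)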
3873 */ 3874 if (VOP_REALVP(tdvp, &realvp, ct) == 0) 3875 tdvp = realvp; 3876 3877 tdzp = VTOZ(tdvp); 3878 ZFS_VERIFY_ZP(tdzp); 3879 3880 /* 3881 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 3882 * ctldir appear to have the same v_vfsp. 3883 */ 3884 if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) { 3885 ZFS_EXIT(zfsvfs); 3886 return (SET_ERROR(EXDEV)); 3887 } 3888 3889 if (zfsvfs->z_utf8 && u8_validate(tnm, 3890 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3891 ZFS_EXIT(zfsvfs); 3892 return (SET_ERROR(EILSEQ)); 3893 } 3894 3895 if (flags & FIGNORECASE) 3896 zflg |= ZCILOOK; 3897 3898 top: 3899 szp = NULL; 3900 tzp = NULL; 3901 zl = NULL; 3902 3903 /* 3904 * This is to prevent the creation of links into attribute space 3905 * by renaming a linked file into/outof an attribute directory. 3906 * See the comment in zfs_link() for why this is considered bad. 3907 */ 3908 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 3909 ZFS_EXIT(zfsvfs); 3910 return (SET_ERROR(EINVAL)); 3911 } 3912 3913 /* 3914 * Lock source and target directory entries. To prevent deadlock, 3915 * a lock ordering must be defined. We lock the directory with 3916 * the smallest object id first, or if it's a tie, the one with 3917 * the lexically first name. 3918 */ 3919 if (sdzp->z_id < tdzp->z_id) { 3920 cmp = -1; 3921 } else if (sdzp->z_id > tdzp->z_id) { 3922 cmp = 1; 3923 } else { 3924 /* 3925 * First compare the two name arguments without 3926 * considering any case folding. 3927 */ 3928 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 3929 3930 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 3931 ASSERT(error == 0 || !zfsvfs->z_utf8); 3932 if (cmp == 0) { 3933 /* 3934 * POSIX: "If the old argument and the new argument 3935 * both refer to links to the same existing file, 3936 * the rename() function shall return successfully 3937 * and perform no other action." 3938 */ 3939 ZFS_EXIT(zfsvfs); 3940 return (0); 3941 } 3942 /* 3943 * If the file system is case-folding, then we may 3944 * have some more checking to do. A case-folding file 3945 * system is either supporting mixed case sensitivity 3946 * access or is completely case-insensitive. Note 3947 * that the file system is always case preserving. 3948 * 3949 * In mixed sensitivity mode case sensitive behavior 3950 * is the default. FIGNORECASE must be used to 3951 * explicitly request case insensitive behavior. 3952 * 3953 * If the source and target names provided differ only 3954 * by case (e.g., a request to rename 'tim' to 'Tim'), 3955 * we will treat this as a special case in the 3956 * case-insensitive mode: as long as the source name 3957 * is an exact match, we will allow this to proceed as 3958 * a name-change request. 3959 */ 3960 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 3961 (zfsvfs->z_case == ZFS_CASE_MIXED && 3962 flags & FIGNORECASE)) && 3963 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 3964 &error) == 0) { 3965 /* 3966 * case preserving rename request, require exact 3967 * name matches 3968 */ 3969 zflg |= ZCIEXACT; 3970 zflg &= ~ZCILOOK; 3971 } 3972 } 3973 3974 /* 3975 * If the source and destination directories are the same, we should 3976 * grab the z_name_lock of that directory only once. 
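 * ZHAVELOCK tells zfs_dirent_lock() that the caller already holds the name lock, so it is not taken again.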
3977 */ 3978 if (sdzp == tdzp) { 3979 zflg |= ZHAVELOCK; 3980 rw_enter(&sdzp->z_name_lock, RW_READER); 3981 } 3982 3983 if (cmp < 0) { 3984 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 3985 ZEXISTS | zflg, NULL, NULL); 3986 terr = zfs_dirent_lock(&tdl, 3987 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 3988 } else { 3989 terr = zfs_dirent_lock(&tdl, 3990 tdzp, tnm, &tzp, zflg, NULL, NULL); 3991 serr = zfs_dirent_lock(&sdl, 3992 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 3993 NULL, NULL); 3994 } 3995 3996 if (serr) { 3997 /* 3998 * Source entry invalid or not there. 3999 */ 4000 if (!terr) { 4001 zfs_dirent_unlock(tdl); 4002 if (tzp) 4003 VN_RELE(ZTOV(tzp)); 4004 } 4005 4006 if (sdzp == tdzp) 4007 rw_exit(&sdzp->z_name_lock); 4008 4009 if (strcmp(snm, "..") == 0) 4010 serr = SET_ERROR(EINVAL); 4011 ZFS_EXIT(zfsvfs); 4012 return (serr); 4013 } 4014 if (terr) { 4015 zfs_dirent_unlock(sdl); 4016 VN_RELE(ZTOV(szp)); 4017 4018 if (sdzp == tdzp) 4019 rw_exit(&sdzp->z_name_lock); 4020 4021 if (strcmp(tnm, "..") == 0) 4022 terr = SET_ERROR(EINVAL); 4023 ZFS_EXIT(zfsvfs); 4024 return (terr); 4025 } 4026 4027 /* 4028 * If we are using project inheritance, it means if the directory has 4029 * ZFS_PROJINHERIT set, then its descendant directories will inherit 4030 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 4031 * such case, we only allow renames into our tree when the project 4032 * IDs are the same. 4033 */ 4034 if (tdzp->z_pflags & ZFS_PROJINHERIT && 4035 tdzp->z_projid != szp->z_projid) { 4036 error = SET_ERROR(EXDEV); 4037 goto out; 4038 } 4039 4040 /* 4041 * Must have write access at the source to remove the old entry 4042 * and write access at the target to create the new entry. 4043 * Note that if target and source are the same, this can be 4044 * done in a single check. 4045 */ 4046 4047 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 4048 goto out; 4049 4050 if (ZTOV(szp)->v_type == VDIR) { 4051 /* 4052 * Check to make sure rename is valid. 4053 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 4054 */ 4055 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) 4056 goto out; 4057 } 4058 4059 /* 4060 * Does target exist? 4061 */ 4062 if (tzp) { 4063 /* 4064 * Source and target must be the same type. 4065 */ 4066 if (ZTOV(szp)->v_type == VDIR) { 4067 if (ZTOV(tzp)->v_type != VDIR) { 4068 error = SET_ERROR(ENOTDIR); 4069 goto out; 4070 } 4071 } else { 4072 if (ZTOV(tzp)->v_type == VDIR) { 4073 error = SET_ERROR(EISDIR); 4074 goto out; 4075 } 4076 } 4077 /* 4078 * POSIX dictates that when the source and target 4079 * entries refer to the same file object, rename 4080 * must do nothing and exit without error. 4081 */ 4082 if (szp->z_id == tzp->z_id) { 4083 error = 0; 4084 goto out; 4085 } 4086 } 4087 4088 vnevent_pre_rename_src(ZTOV(szp), sdvp, snm, ct); 4089 if (tzp) 4090 vnevent_pre_rename_dest(ZTOV(tzp), tdvp, tnm, ct); 4091 4092 /* 4093 * notify the target directory if it is not the same 4094 * as source directory. 
4095 */ 4096 if (tdvp != sdvp) { 4097 vnevent_pre_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct); 4098 } 4099 4100 tx = dmu_tx_create(zfsvfs->z_os); 4101 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4102 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 4103 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 4104 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 4105 if (sdzp != tdzp) { 4106 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 4107 zfs_sa_upgrade_txholds(tx, tdzp); 4108 } 4109 if (tzp) { 4110 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 4111 zfs_sa_upgrade_txholds(tx, tzp); 4112 } 4113 4114 zfs_sa_upgrade_txholds(tx, szp); 4115 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 4116 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 4117 if (error) { 4118 if (zl != NULL) 4119 zfs_rename_unlock(&zl); 4120 zfs_dirent_unlock(sdl); 4121 zfs_dirent_unlock(tdl); 4122 4123 if (sdzp == tdzp) 4124 rw_exit(&sdzp->z_name_lock); 4125 4126 VN_RELE(ZTOV(szp)); 4127 if (tzp) 4128 VN_RELE(ZTOV(tzp)); 4129 if (error == ERESTART) { 4130 waited = B_TRUE; 4131 dmu_tx_wait(tx); 4132 dmu_tx_abort(tx); 4133 goto top; 4134 } 4135 dmu_tx_abort(tx); 4136 ZFS_EXIT(zfsvfs); 4137 return (error); 4138 } 4139 4140 if (tzp) /* Attempt to remove the existing target */ 4141 error = rm_err = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); 4142 4143 if (error == 0) { 4144 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 4145 if (error == 0) { 4146 szp->z_pflags |= ZFS_AV_MODIFIED; 4147 if (tdzp->z_pflags & ZFS_PROJINHERIT) 4148 szp->z_pflags |= ZFS_PROJINHERIT; 4149 4150 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 4151 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 4152 ASSERT0(error); 4153 4154 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 4155 if (error == 0) { 4156 zfs_log_rename(zilog, tx, TX_RENAME | 4157 (flags & FIGNORECASE ? TX_CI : 0), sdzp, 4158 sdl->dl_name, tdzp, tdl->dl_name, szp); 4159 4160 /* 4161 * Update path information for the target vnode 4162 */ 4163 vn_renamepath(tdvp, ZTOV(szp), tnm, 4164 strlen(tnm)); 4165 } else { 4166 /* 4167 * At this point, we have successfully created 4168 * the target name, but have failed to remove 4169 * the source name. Since the create was done 4170 * with the ZRENAMING flag, there are 4171 * complications; for one, the link count is 4172 * wrong. The easiest way to deal with this 4173 * is to remove the newly created target, and 4174 * return the original error. This must 4175 * succeed; fortunately, it is very unlikely to 4176 * fail, since we just created it. 4177 */ 4178 VERIFY3U(zfs_link_destroy(tdl, szp, tx, 4179 ZRENAMING, NULL), ==, 0); 4180 } 4181 } 4182 } 4183 4184 dmu_tx_commit(tx); 4185 4186 if (tzp && rm_err == 0) 4187 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); 4188 4189 if (error == 0) { 4190 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); 4191 /* notify the target dir if it is not the same as source dir */ 4192 if (tdvp != sdvp) 4193 vnevent_rename_dest_dir(tdvp, ct); 4194 } 4195 out: 4196 if (zl != NULL) 4197 zfs_rename_unlock(&zl); 4198 4199 zfs_dirent_unlock(sdl); 4200 zfs_dirent_unlock(tdl); 4201 4202 if (sdzp == tdzp) 4203 rw_exit(&sdzp->z_name_lock); 4204 4205 4206 VN_RELE(ZTOV(szp)); 4207 if (tzp) 4208 VN_RELE(ZTOV(tzp)); 4209 4210 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4211 zil_commit(zilog, 0); 4212 4213 ZFS_EXIT(zfsvfs); 4214 return (error); 4215 } 4216 4217 /* 4218 * Insert the indicated symbolic reference entry into the directory. 4219 * 4220 * IN: dvp - Directory to contain new symbolic link. 
4221 * link - Name for new symlink entry. 4222 * vap - Attributes of new entry. 4223 * cr - credentials of caller. 4224 * ct - caller context 4225 * flags - case flags 4226 * 4227 * RETURN: 0 on success, error code on failure. 4228 * 4229 * Timestamps: 4230 * dvp - ctime|mtime updated 4231 */ 4232 /*ARGSUSED*/ 4233 static int 4234 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, 4235 caller_context_t *ct, int flags) 4236 { 4237 znode_t *zp, *dzp = VTOZ(dvp); 4238 zfs_dirlock_t *dl; 4239 dmu_tx_t *tx; 4240 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4241 zilog_t *zilog; 4242 uint64_t len = strlen(link); 4243 int error; 4244 int zflg = ZNEW; 4245 zfs_acl_ids_t acl_ids; 4246 boolean_t fuid_dirtied; 4247 uint64_t txtype = TX_SYMLINK; 4248 boolean_t waited = B_FALSE; 4249 4250 ASSERT(vap->va_type == VLNK); 4251 4252 ZFS_ENTER(zfsvfs); 4253 ZFS_VERIFY_ZP(dzp); 4254 zilog = zfsvfs->z_log; 4255 4256 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 4257 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4258 ZFS_EXIT(zfsvfs); 4259 return (SET_ERROR(EILSEQ)); 4260 } 4261 if (flags & FIGNORECASE) 4262 zflg |= ZCILOOK; 4263 4264 if (len > MAXPATHLEN) { 4265 ZFS_EXIT(zfsvfs); 4266 return (SET_ERROR(ENAMETOOLONG)); 4267 } 4268 4269 if ((error = zfs_acl_ids_create(dzp, 0, 4270 vap, cr, NULL, &acl_ids)) != 0) { 4271 ZFS_EXIT(zfsvfs); 4272 return (error); 4273 } 4274 top: 4275 /* 4276 * Attempt to lock directory; fail if entry already exists. 4277 */ 4278 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 4279 if (error) { 4280 zfs_acl_ids_free(&acl_ids); 4281 ZFS_EXIT(zfsvfs); 4282 return (error); 4283 } 4284 4285 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4286 zfs_acl_ids_free(&acl_ids); 4287 zfs_dirent_unlock(dl); 4288 ZFS_EXIT(zfsvfs); 4289 return (error); 4290 } 4291 4292 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { 4293 zfs_acl_ids_free(&acl_ids); 4294 zfs_dirent_unlock(dl); 4295 ZFS_EXIT(zfsvfs); 4296 return (SET_ERROR(EDQUOT)); 4297 } 4298 tx = dmu_tx_create(zfsvfs->z_os); 4299 fuid_dirtied = zfsvfs->z_fuid_dirty; 4300 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 4301 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4302 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 4303 ZFS_SA_BASE_ATTR_SIZE + len); 4304 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 4305 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 4306 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 4307 acl_ids.z_aclp->z_acl_bytes); 4308 } 4309 if (fuid_dirtied) 4310 zfs_fuid_txhold(zfsvfs, tx); 4311 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 4312 if (error) { 4313 zfs_dirent_unlock(dl); 4314 if (error == ERESTART) { 4315 waited = B_TRUE; 4316 dmu_tx_wait(tx); 4317 dmu_tx_abort(tx); 4318 goto top; 4319 } 4320 zfs_acl_ids_free(&acl_ids); 4321 dmu_tx_abort(tx); 4322 ZFS_EXIT(zfsvfs); 4323 return (error); 4324 } 4325 4326 /* 4327 * Create a new object for the symlink. 
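 * The new znode is created by zfs_mknode() below; whether the link target
 * is then stored in the SA_ZPL_SYMLINK attribute or in the legacy symlink
 * area depends on zp->z_is_sa.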
4328 * For version 4 ZPL datasets the symlink will be an SA attribute. 4329 */ 4330 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 4331 4332 if (fuid_dirtied) 4333 zfs_fuid_sync(zfsvfs, tx); 4334 4335 mutex_enter(&zp->z_lock); 4336 if (zp->z_is_sa) 4337 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 4338 link, len, tx); 4339 else 4340 zfs_sa_symlink(zp, link, len, tx); 4341 mutex_exit(&zp->z_lock); 4342 4343 zp->z_size = len; 4344 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 4345 &zp->z_size, sizeof (zp->z_size), tx); 4346 /* 4347 * Insert the new object into the directory. 4348 */ 4349 (void) zfs_link_create(dl, zp, tx, ZNEW); 4350 4351 if (flags & FIGNORECASE) 4352 txtype |= TX_CI; 4353 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 4354 4355 zfs_acl_ids_free(&acl_ids); 4356 4357 dmu_tx_commit(tx); 4358 4359 zfs_dirent_unlock(dl); 4360 4361 VN_RELE(ZTOV(zp)); 4362 4363 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4364 zil_commit(zilog, 0); 4365 4366 ZFS_EXIT(zfsvfs); 4367 return (error); 4368 } 4369 4370 /* 4371 * Return, in the buffer contained in the provided uio structure, 4372 * the symbolic path referred to by vp. 4373 * 4374 * IN: vp - vnode of symbolic link. 4375 * uio - structure to contain the link path. 4376 * cr - credentials of caller. 4377 * ct - caller context 4378 * 4379 * OUT: uio - structure containing the link path. 4380 * 4381 * RETURN: 0 on success, error code on failure. 4382 * 4383 * Timestamps: 4384 * vp - atime updated 4385 */ 4386 /* ARGSUSED */ 4387 static int 4388 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 4389 { 4390 znode_t *zp = VTOZ(vp); 4391 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4392 int error; 4393 4394 ZFS_ENTER(zfsvfs); 4395 ZFS_VERIFY_ZP(zp); 4396 4397 mutex_enter(&zp->z_lock); 4398 if (zp->z_is_sa) 4399 error = sa_lookup_uio(zp->z_sa_hdl, 4400 SA_ZPL_SYMLINK(zfsvfs), uio); 4401 else 4402 error = zfs_sa_readlink(zp, uio); 4403 mutex_exit(&zp->z_lock); 4404 4405 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4406 4407 ZFS_EXIT(zfsvfs); 4408 return (error); 4409 } 4410 4411 /* 4412 * Insert a new entry into directory tdvp referencing svp. 4413 * 4414 * IN: tdvp - Directory to contain new entry. 4415 * svp - vnode of new entry. 4416 * name - name of new entry. 4417 * cr - credentials of caller. 4418 * ct - caller context 4419 * 4420 * RETURN: 0 on success, error code on failure. 4421 * 4422 * Timestamps: 4423 * tdvp - ctime|mtime updated 4424 * svp - ctime updated 4425 */ 4426 /* ARGSUSED */ 4427 static int 4428 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 4429 caller_context_t *ct, int flags) 4430 { 4431 znode_t *dzp = VTOZ(tdvp); 4432 znode_t *tzp, *szp; 4433 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4434 zilog_t *zilog; 4435 zfs_dirlock_t *dl; 4436 dmu_tx_t *tx; 4437 vnode_t *realvp; 4438 int error; 4439 int zf = ZNEW; 4440 uint64_t parent; 4441 uid_t owner; 4442 boolean_t waited = B_FALSE; 4443 4444 ASSERT(tdvp->v_type == VDIR); 4445 4446 ZFS_ENTER(zfsvfs); 4447 ZFS_VERIFY_ZP(dzp); 4448 zilog = zfsvfs->z_log; 4449 4450 if (VOP_REALVP(svp, &realvp, ct) == 0) 4451 svp = realvp; 4452 4453 /* 4454 * POSIX dictates that we return EPERM here. 4455 * Better choices include ENOTSUP or EISDIR.
4456 */ 4457 if (svp->v_type == VDIR) { 4458 ZFS_EXIT(zfsvfs); 4459 return (SET_ERROR(EPERM)); 4460 } 4461 4462 szp = VTOZ(svp); 4463 ZFS_VERIFY_ZP(szp); 4464 4465 /* 4466 * If we are using project inheritance, it means if the directory has 4467 * ZFS_PROJINHERIT set, then its descendant directories will inherit 4468 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 4469 * such case, we only allow hard link creation in our tree when the 4470 * project IDs are the same. 4471 */ 4472 if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) { 4473 ZFS_EXIT(zfsvfs); 4474 return (SET_ERROR(EXDEV)); 4475 } 4476 4477 /* 4478 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 4479 * ctldir appear to have the same v_vfsp. 4480 */ 4481 if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) { 4482 ZFS_EXIT(zfsvfs); 4483 return (SET_ERROR(EXDEV)); 4484 } 4485 4486 /* Prevent links to .zfs/shares files */ 4487 4488 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 4489 &parent, sizeof (uint64_t))) != 0) { 4490 ZFS_EXIT(zfsvfs); 4491 return (error); 4492 } 4493 if (parent == zfsvfs->z_shares_dir) { 4494 ZFS_EXIT(zfsvfs); 4495 return (SET_ERROR(EPERM)); 4496 } 4497 4498 if (zfsvfs->z_utf8 && u8_validate(name, 4499 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4500 ZFS_EXIT(zfsvfs); 4501 return (SET_ERROR(EILSEQ)); 4502 } 4503 if (flags & FIGNORECASE) 4504 zf |= ZCILOOK; 4505 4506 /* 4507 * We do not support links between attributes and non-attributes 4508 * because of the potential security risk of creating links 4509 * into "normal" file space in order to circumvent restrictions 4510 * imposed in attribute space. 4511 */ 4512 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 4513 ZFS_EXIT(zfsvfs); 4514 return (SET_ERROR(EINVAL)); 4515 } 4516 4517 4518 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4519 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { 4520 ZFS_EXIT(zfsvfs); 4521 return (SET_ERROR(EPERM)); 4522 } 4523 4524 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4525 ZFS_EXIT(zfsvfs); 4526 return (error); 4527 } 4528 4529 top: 4530 /* 4531 * Attempt to lock directory; fail if entry already exists. 4532 */ 4533 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 4534 if (error) { 4535 ZFS_EXIT(zfsvfs); 4536 return (error); 4537 } 4538 4539 tx = dmu_tx_create(zfsvfs->z_os); 4540 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4541 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4542 zfs_sa_upgrade_txholds(tx, szp); 4543 zfs_sa_upgrade_txholds(tx, dzp); 4544 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 4545 if (error) { 4546 zfs_dirent_unlock(dl); 4547 if (error == ERESTART) { 4548 waited = B_TRUE; 4549 dmu_tx_wait(tx); 4550 dmu_tx_abort(tx); 4551 goto top; 4552 } 4553 dmu_tx_abort(tx); 4554 ZFS_EXIT(zfsvfs); 4555 return (error); 4556 } 4557 4558 error = zfs_link_create(dl, szp, tx, 0); 4559 4560 if (error == 0) { 4561 uint64_t txtype = TX_LINK; 4562 if (flags & FIGNORECASE) 4563 txtype |= TX_CI; 4564 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4565 } 4566 4567 dmu_tx_commit(tx); 4568 4569 zfs_dirent_unlock(dl); 4570 4571 if (error == 0) { 4572 vnevent_link(svp, ct); 4573 } 4574 4575 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4576 zil_commit(zilog, 0); 4577 4578 ZFS_EXIT(zfsvfs); 4579 return (error); 4580 } 4581 4582 /* 4583 * zfs_null_putapage() is used when the file system has been force 4584 * unmounted. It just drops the pages. 
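 * The B_INVAL|B_FORCE|B_ERROR flags ask pvn_write_done() to invalidate
 * the pages rather than write them back.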
4585 */ 4586 /* ARGSUSED */ 4587 static int 4588 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4589 size_t *lenp, int flags, cred_t *cr) 4590 { 4591 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); 4592 return (0); 4593 } 4594 4595 /* 4596 * Push a page out to disk, klustering if possible. 4597 * 4598 * IN: vp - file to push page to. 4599 * pp - page to push. 4600 * flags - additional flags. 4601 * cr - credentials of caller. 4602 * 4603 * OUT: offp - start of range pushed. 4604 * lenp - len of range pushed. 4605 * 4606 * RETURN: 0 on success, error code on failure. 4607 * 4608 * NOTE: callers must have locked the page to be pushed. On 4609 * exit, the page (and all other pages in the kluster) must be 4610 * unlocked. 4611 */ 4612 /* ARGSUSED */ 4613 static int 4614 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4615 size_t *lenp, int flags, cred_t *cr) 4616 { 4617 znode_t *zp = VTOZ(vp); 4618 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4619 dmu_tx_t *tx; 4620 u_offset_t off, koff; 4621 size_t len, klen; 4622 int err; 4623 4624 off = pp->p_offset; 4625 len = PAGESIZE; 4626 /* 4627 * If our blocksize is bigger than the page size, try to kluster 4628 * multiple pages so that we write a full block (thus avoiding 4629 * a read-modify-write). 4630 */ 4631 if (off < zp->z_size && zp->z_blksz > PAGESIZE) { 4632 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); 4633 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; 4634 ASSERT(koff <= zp->z_size); 4635 if (koff + klen > zp->z_size) 4636 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); 4637 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); 4638 } 4639 ASSERT3U(btop(len), ==, btopr(len)); 4640 4641 /* 4642 * Can't push pages past end-of-file. 4643 */ 4644 if (off >= zp->z_size) { 4645 /* ignore all pages */ 4646 err = 0; 4647 goto out; 4648 } else if (off + len > zp->z_size) { 4649 int npages = btopr(zp->z_size - off); 4650 page_t *trunc; 4651 4652 page_list_break(&pp, &trunc, npages); 4653 /* ignore pages past end of file */ 4654 if (trunc) 4655 pvn_write_done(trunc, flags); 4656 len = zp->z_size - off; 4657 } 4658 4659 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) || 4660 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid)) { 4661 err = SET_ERROR(EDQUOT); 4662 goto out; 4663 } 4664 tx = dmu_tx_create(zfsvfs->z_os); 4665 dmu_tx_hold_write(tx, zp->z_id, off, len); 4666 4667 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4668 zfs_sa_upgrade_txholds(tx, zp); 4669 err = dmu_tx_assign(tx, TXG_WAIT); 4670 if (err != 0) { 4671 dmu_tx_abort(tx); 4672 goto out; 4673 } 4674 4675 if (zp->z_blksz <= PAGESIZE) { 4676 caddr_t va = zfs_map_page(pp, S_READ); 4677 ASSERT3U(len, <=, PAGESIZE); 4678 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); 4679 zfs_unmap_page(pp, va); 4680 } else { 4681 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); 4682 } 4683 4684 if (err == 0) { 4685 uint64_t mtime[2], ctime[2]; 4686 sa_bulk_attr_t bulk[3]; 4687 int count = 0; 4688 4689 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 4690 &mtime, 16); 4691 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 4692 &ctime, 16); 4693 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 4694 &zp->z_pflags, 8); 4695 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 4696 B_TRUE); 4697 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 4698 ASSERT0(err); 4699 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); 4700 } 4701 dmu_tx_commit(tx); 4702 4703 out: 4704 
pvn_write_done(pp, (err ? B_ERROR : 0) | flags); 4705 if (offp) 4706 *offp = off; 4707 if (lenp) 4708 *lenp = len; 4709 4710 return (err); 4711 } 4712 4713 /* 4714 * Copy the indicated portion of the file from pages into the file. 4715 * The pages are stored in a page list attached to the file's vnode. 4716 * 4717 * IN: vp - vnode of file to push page data to. 4718 * off - position in file to put data. 4719 * len - amount of data to write. 4720 * flags - flags to control the operation. 4721 * cr - credentials of caller. 4722 * ct - caller context. 4723 * 4724 * RETURN: 0 on success, error code on failure. 4725 * 4726 * Timestamps: 4727 * vp - ctime|mtime updated 4728 */ 4729 /*ARGSUSED*/ 4730 static int 4731 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 4732 caller_context_t *ct) 4733 { 4734 znode_t *zp = VTOZ(vp); 4735 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4736 page_t *pp; 4737 size_t io_len; 4738 u_offset_t io_off; 4739 uint_t blksz; 4740 locked_range_t *lr; 4741 int error = 0; 4742 4743 ZFS_ENTER(zfsvfs); 4744 ZFS_VERIFY_ZP(zp); 4745 4746 /* 4747 * There's nothing to do if no data is cached. 4748 */ 4749 if (!vn_has_cached_data(vp)) { 4750 ZFS_EXIT(zfsvfs); 4751 return (0); 4752 } 4753 4754 /* 4755 * Align this request to the file block size in case we kluster. 4756 * XXX - this can result in pretty aggressive locking, which can 4757 * impact simultaneous read/write access. One option might be 4758 * to break up long requests (len == 0) into block-by-block 4759 * operations to get narrower locking. 4760 */ 4761 blksz = zp->z_blksz; 4762 if (ISP2(blksz)) 4763 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); 4764 else 4765 io_off = 0; 4766 if (len > 0 && ISP2(blksz)) 4767 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); 4768 else 4769 io_len = 0; 4770 4771 if (io_len == 0) { 4772 /* 4773 * Search the entire vp list for pages >= io_off. 4774 */ 4775 lr = rangelock_enter(&zp->z_rangelock, 4776 io_off, UINT64_MAX, RL_WRITER); 4777 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); 4778 goto out; 4779 } 4780 lr = rangelock_enter(&zp->z_rangelock, io_off, io_len, RL_WRITER); 4781 4782 if (off > zp->z_size) { 4783 /* past end of file */ 4784 rangelock_exit(lr); 4785 ZFS_EXIT(zfsvfs); 4786 return (0); 4787 } 4788 4789 len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); 4790 4791 for (off = io_off; io_off < off + len; io_off += io_len) { 4792 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 4793 pp = page_lookup(vp, io_off, 4794 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); 4795 } else { 4796 pp = page_lookup_nowait(vp, io_off, 4797 (flags & B_FREE) ?
SE_EXCL : SE_SHARED); 4798 } 4799 4800 if (pp != NULL && pvn_getdirty(pp, flags)) { 4801 int err; 4802 4803 /* 4804 * Found a dirty page to push 4805 */ 4806 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); 4807 if (err) 4808 error = err; 4809 } else { 4810 io_len = PAGESIZE; 4811 } 4812 } 4813 out: 4814 rangelock_exit(lr); 4815 if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4816 zil_commit(zfsvfs->z_log, zp->z_id); 4817 ZFS_EXIT(zfsvfs); 4818 return (error); 4819 } 4820 4821 /*ARGSUSED*/ 4822 void 4823 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4824 { 4825 znode_t *zp = VTOZ(vp); 4826 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4827 int error; 4828 4829 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4830 if (zp->z_sa_hdl == NULL) { 4831 /* 4832 * The fs has been unmounted, or we did a 4833 * suspend/resume and this file no longer exists. 4834 */ 4835 if (vn_has_cached_data(vp)) { 4836 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, 4837 B_INVAL, cr); 4838 } 4839 4840 mutex_enter(&zp->z_lock); 4841 mutex_enter(&vp->v_lock); 4842 ASSERT(vp->v_count == 1); 4843 VN_RELE_LOCKED(vp); 4844 mutex_exit(&vp->v_lock); 4845 mutex_exit(&zp->z_lock); 4846 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4847 zfs_znode_free(zp); 4848 return; 4849 } 4850 4851 /* 4852 * Attempt to push any data in the page cache. If this fails 4853 * we will get kicked out later in zfs_zinactive(). 4854 */ 4855 if (vn_has_cached_data(vp)) { 4856 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, 4857 cr); 4858 } 4859 4860 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 4861 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4862 4863 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4864 zfs_sa_upgrade_txholds(tx, zp); 4865 error = dmu_tx_assign(tx, TXG_WAIT); 4866 if (error) { 4867 dmu_tx_abort(tx); 4868 } else { 4869 mutex_enter(&zp->z_lock); 4870 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4871 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 4872 zp->z_atime_dirty = 0; 4873 mutex_exit(&zp->z_lock); 4874 dmu_tx_commit(tx); 4875 } 4876 } 4877 4878 zfs_zinactive(zp); 4879 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4880 } 4881 4882 /* 4883 * Bounds-check the seek operation. 4884 * 4885 * IN: vp - vnode seeking within 4886 * ooff - old file offset 4887 * noffp - pointer to new file offset 4888 * ct - caller context 4889 * 4890 * RETURN: 0 on success, EINVAL if new offset invalid. 4891 */ 4892 /* ARGSUSED */ 4893 static int 4894 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, 4895 caller_context_t *ct) 4896 { 4897 if (vp->v_type == VDIR) 4898 return (0); 4899 return ((*noffp < 0) ? EINVAL : 0); 4900 } 4901 4902 /* 4903 * Pre-filter the generic locking function to trap attempts to place 4904 * a mandatory lock on a memory mapped file. 4905 */ 4906 static int 4907 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, 4908 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) 4909 { 4910 znode_t *zp = VTOZ(vp); 4911 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4912 4913 ZFS_ENTER(zfsvfs); 4914 ZFS_VERIFY_ZP(zp); 4915 4916 /* 4917 * We are following the UFS semantics with respect to mapcnt 4918 * here: If we see that the file is mapped already, then we will 4919 * return an error, but we don't worry about races between this 4920 * function and zfs_map(). 
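 * The same exclusion is enforced from the other direction in zfs_map(),
 * which refuses to map a file that has mandatory locking enabled and
 * already holds record locks.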
4921 */ 4922 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { 4923 ZFS_EXIT(zfsvfs); 4924 return (SET_ERROR(EAGAIN)); 4925 } 4926 ZFS_EXIT(zfsvfs); 4927 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 4928 } 4929 4930 /* 4931 * If we can't find a page in the cache, we will create a new page 4932 * and fill it with file data. For efficiency, we may try to fill 4933 * multiple pages at once (klustering) to fill up the supplied page 4934 * list. Note that the pages to be filled are held with an exclusive 4935 * lock to prevent access by other threads while they are being filled. 4936 */ 4937 static int 4938 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, 4939 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) 4940 { 4941 znode_t *zp = VTOZ(vp); 4942 page_t *pp, *cur_pp; 4943 objset_t *os = zp->z_zfsvfs->z_os; 4944 u_offset_t io_off, total; 4945 size_t io_len; 4946 int err; 4947 4948 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { 4949 /* 4950 * We only have a single page, don't bother klustering 4951 */ 4952 io_off = off; 4953 io_len = PAGESIZE; 4954 pp = page_create_va(vp, io_off, io_len, 4955 PG_EXCL | PG_WAIT, seg, addr); 4956 } else { 4957 /* 4958 * Try to find enough pages to fill the page list 4959 */ 4960 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4961 &io_len, off, plsz, 0); 4962 } 4963 if (pp == NULL) { 4964 /* 4965 * The page already exists, nothing to do here. 4966 */ 4967 *pl = NULL; 4968 return (0); 4969 } 4970 4971 /* 4972 * Fill the pages in the kluster. 4973 */ 4974 cur_pp = pp; 4975 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 4976 caddr_t va; 4977 4978 ASSERT3U(io_off, ==, cur_pp->p_offset); 4979 va = zfs_map_page(cur_pp, S_WRITE); 4980 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, 4981 DMU_READ_PREFETCH); 4982 zfs_unmap_page(cur_pp, va); 4983 if (err) { 4984 /* On error, toss the entire kluster */ 4985 pvn_read_done(pp, B_ERROR); 4986 /* convert checksum errors into IO errors */ 4987 if (err == ECKSUM) 4988 err = SET_ERROR(EIO); 4989 return (err); 4990 } 4991 cur_pp = cur_pp->p_next; 4992 } 4993 4994 /* 4995 * Fill in the page list array from the kluster starting 4996 * from the desired offset `off'. 4997 * NOTE: the page list will always be null terminated. 4998 */ 4999 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 5000 ASSERT(pl == NULL || (*pl)->p_offset == off); 5001 5002 return (0); 5003 } 5004 5005 /* 5006 * Return pointers to the pages for the file region [off, off + len] 5007 * in the pl array. If plsz is greater than len, this function may 5008 * also return page pointers from after the specified region 5009 * (i.e. the region [off, off + plsz]). These additional pages are 5010 * only returned if they are already in the cache, or were created as 5011 * part of a klustered read. 5012 * 5013 * IN: vp - vnode of file to get data from. 5014 * off - position in file to get data from. 5015 * len - amount of data to retrieve. 5016 * plsz - length of provided page list. 5017 * seg - segment to obtain pages for. 5018 * addr - virtual address of fault. 5019 * rw - mode of created pages. 5020 * cr - credentials of caller. 5021 * ct - caller context. 5022 * 5023 * OUT: protp - protection mode of created pages. 5024 * pl - list of pages created. 5025 * 5026 * RETURN: 0 on success, error code on failure. 
5027 * 5028 * Timestamps: 5029 * vp - atime updated 5030 */ 5031 /* ARGSUSED */ 5032 static int 5033 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 5034 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 5035 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 5036 { 5037 znode_t *zp = VTOZ(vp); 5038 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5039 page_t **pl0 = pl; 5040 int err = 0; 5041 5042 /* we do our own caching, faultahead is unnecessary */ 5043 if (pl == NULL) 5044 return (0); 5045 else if (len > plsz) 5046 len = plsz; 5047 else 5048 len = P2ROUNDUP(len, PAGESIZE); 5049 ASSERT(plsz >= len); 5050 5051 ZFS_ENTER(zfsvfs); 5052 ZFS_VERIFY_ZP(zp); 5053 5054 if (protp) 5055 *protp = PROT_ALL; 5056 5057 /* 5058 * Loop through the requested range [off, off + len) looking 5059 * for pages. If we don't find a page, we will need to create 5060 * a new page and fill it with data from the file. 5061 */ 5062 while (len > 0) { 5063 if (*pl = page_lookup(vp, off, SE_SHARED)) 5064 *(pl+1) = NULL; 5065 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) 5066 goto out; 5067 while (*pl) { 5068 ASSERT3U((*pl)->p_offset, ==, off); 5069 off += PAGESIZE; 5070 addr += PAGESIZE; 5071 if (len > 0) { 5072 ASSERT3U(len, >=, PAGESIZE); 5073 len -= PAGESIZE; 5074 } 5075 ASSERT3U(plsz, >=, PAGESIZE); 5076 plsz -= PAGESIZE; 5077 pl++; 5078 } 5079 } 5080 5081 /* 5082 * Fill out the page array with any pages already in the cache. 5083 */ 5084 while (plsz > 0 && 5085 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { 5086 off += PAGESIZE; 5087 plsz -= PAGESIZE; 5088 } 5089 out: 5090 if (err) { 5091 /* 5092 * Release any pages we have previously locked. 5093 */ 5094 while (pl > pl0) 5095 page_unlock(*--pl); 5096 } else { 5097 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 5098 } 5099 5100 *pl = NULL; 5101 5102 ZFS_EXIT(zfsvfs); 5103 return (err); 5104 } 5105 5106 /* 5107 * Request a memory map for a section of a file. This code interacts 5108 * with common code and the VM system as follows: 5109 * 5110 * - common code calls mmap(), which ends up in smmap_common() 5111 * - this calls VOP_MAP(), which takes you into (say) zfs 5112 * - zfs_map() calls as_map(), passing segvn_create() as the callback 5113 * - segvn_create() creates the new segment and calls VOP_ADDMAP() 5114 * - zfs_addmap() updates z_mapcnt 5115 */ 5116 /*ARGSUSED*/ 5117 static int 5118 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 5119 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 5120 caller_context_t *ct) 5121 { 5122 znode_t *zp = VTOZ(vp); 5123 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5124 segvn_crargs_t vn_a; 5125 int error; 5126 5127 ZFS_ENTER(zfsvfs); 5128 ZFS_VERIFY_ZP(zp); 5129 5130 /* 5131 * Note: ZFS_READONLY is handled in zfs_zaccess_common. 5132 */ 5133 5134 if ((prot & PROT_WRITE) && (zp->z_pflags & 5135 (ZFS_IMMUTABLE | ZFS_APPENDONLY))) { 5136 ZFS_EXIT(zfsvfs); 5137 return (SET_ERROR(EPERM)); 5138 } 5139 5140 if ((prot & (PROT_READ | PROT_EXEC)) && 5141 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 5142 ZFS_EXIT(zfsvfs); 5143 return (SET_ERROR(EACCES)); 5144 } 5145 5146 if (vp->v_flag & VNOMAP) { 5147 ZFS_EXIT(zfsvfs); 5148 return (SET_ERROR(ENOSYS)); 5149 } 5150 5151 if (off < 0 || len > MAXOFFSET_T - off) { 5152 ZFS_EXIT(zfsvfs); 5153 return (SET_ERROR(ENXIO)); 5154 } 5155 5156 if (vp->v_type != VREG) { 5157 ZFS_EXIT(zfsvfs); 5158 return (SET_ERROR(ENODEV)); 5159 } 5160 5161 /* 5162 * If file is locked, disallow mapping. 
5163 */ 5164 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { 5165 ZFS_EXIT(zfsvfs); 5166 return (SET_ERROR(EAGAIN)); 5167 } 5168 5169 as_rangelock(as); 5170 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 5171 if (error != 0) { 5172 as_rangeunlock(as); 5173 ZFS_EXIT(zfsvfs); 5174 return (error); 5175 } 5176 5177 vn_a.vp = vp; 5178 vn_a.offset = (u_offset_t)off; 5179 vn_a.type = flags & MAP_TYPE; 5180 vn_a.prot = prot; 5181 vn_a.maxprot = maxprot; 5182 vn_a.cred = cr; 5183 vn_a.amp = NULL; 5184 vn_a.flags = flags & ~MAP_TYPE; 5185 vn_a.szc = 0; 5186 vn_a.lgrp_mem_policy_flags = 0; 5187 5188 error = as_map(as, *addrp, len, segvn_create, &vn_a); 5189 5190 as_rangeunlock(as); 5191 ZFS_EXIT(zfsvfs); 5192 return (error); 5193 } 5194 5195 /* ARGSUSED */ 5196 static int 5197 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 5198 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 5199 caller_context_t *ct) 5200 { 5201 uint64_t pages = btopr(len); 5202 5203 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); 5204 return (0); 5205 } 5206 5207 /* ARGSUSED */ 5208 static int 5209 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 5210 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 5211 caller_context_t *ct) 5212 { 5213 uint64_t pages = btopr(len); 5214 5215 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); 5216 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); 5217 5218 return (0); 5219 } 5220 5221 /* 5222 * Free or allocate space in a file. Currently, this function only 5223 * supports the `F_FREESP' command. However, this command is somewhat 5224 * misnamed, as its functionality includes the ability to allocate as 5225 * well as free space. 5226 * 5227 * IN: vp - vnode of file to free data in. 5228 * cmd - action to take (only F_FREESP supported). 5229 * bfp - section of file to free/alloc. 5230 * flag - current file open mode flags. 5231 * offset - current file offset. 5232 * cr - credentials of caller [UNUSED]. 5233 * ct - caller context. 5234 * 5235 * RETURN: 0 on success, error code on failure. 5236 * 5237 * Timestamps: 5238 * vp - ctime|mtime updated 5239 */ 5240 /* ARGSUSED */ 5241 static int 5242 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, 5243 offset_t offset, cred_t *cr, caller_context_t *ct) 5244 { 5245 znode_t *zp = VTOZ(vp); 5246 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5247 uint64_t off, len; 5248 int error; 5249 5250 ZFS_ENTER(zfsvfs); 5251 ZFS_VERIFY_ZP(zp); 5252 5253 if (cmd != F_FREESP) { 5254 ZFS_EXIT(zfsvfs); 5255 return (SET_ERROR(EINVAL)); 5256 } 5257 5258 /* 5259 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our 5260 * callers might not be able to detect properly that we are read-only, 5261 * so check it explicitly here. 
5262 */ 5263 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 5264 ZFS_EXIT(zfsvfs); 5265 return (SET_ERROR(EROFS)); 5266 } 5267 5268 if (error = convoff(vp, bfp, 0, offset)) { 5269 ZFS_EXIT(zfsvfs); 5270 return (error); 5271 } 5272 5273 if (bfp->l_len < 0) { 5274 ZFS_EXIT(zfsvfs); 5275 return (SET_ERROR(EINVAL)); 5276 } 5277 5278 off = bfp->l_start; 5279 len = bfp->l_len; /* 0 means from off to end of file */ 5280 5281 error = zfs_freesp(zp, off, len, flag, TRUE); 5282 5283 if (error == 0 && off == 0 && len == 0) 5284 vnevent_truncate(ZTOV(zp), ct); 5285 5286 ZFS_EXIT(zfsvfs); 5287 return (error); 5288 } 5289 5290 /*ARGSUSED*/ 5291 static int 5292 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 5293 { 5294 znode_t *zp = VTOZ(vp); 5295 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5296 uint32_t gen; 5297 uint64_t gen64; 5298 uint64_t object = zp->z_id; 5299 zfid_short_t *zfid; 5300 int size, i, error; 5301 5302 ZFS_ENTER(zfsvfs); 5303 ZFS_VERIFY_ZP(zp); 5304 5305 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 5306 &gen64, sizeof (uint64_t))) != 0) { 5307 ZFS_EXIT(zfsvfs); 5308 return (error); 5309 } 5310 5311 gen = (uint32_t)gen64; 5312 5313 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; 5314 if (fidp->fid_len < size) { 5315 fidp->fid_len = size; 5316 ZFS_EXIT(zfsvfs); 5317 return (SET_ERROR(ENOSPC)); 5318 } 5319 5320 zfid = (zfid_short_t *)fidp; 5321 5322 zfid->zf_len = size; 5323 5324 for (i = 0; i < sizeof (zfid->zf_object); i++) 5325 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 5326 5327 /* Must have a non-zero generation number to distinguish from .zfs */ 5328 if (gen == 0) 5329 gen = 1; 5330 for (i = 0; i < sizeof (zfid->zf_gen); i++) 5331 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 5332 5333 if (size == LONG_FID_LEN) { 5334 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 5335 zfid_long_t *zlfid; 5336 5337 zlfid = (zfid_long_t *)fidp; 5338 5339 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 5340 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 5341 5342 /* XXX - this should be the generation number for the objset */ 5343 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 5344 zlfid->zf_setgen[i] = 0; 5345 } 5346 5347 ZFS_EXIT(zfsvfs); 5348 return (0); 5349 } 5350 5351 static int 5352 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 5353 caller_context_t *ct) 5354 { 5355 znode_t *zp, *xzp; 5356 zfsvfs_t *zfsvfs; 5357 zfs_dirlock_t *dl; 5358 int error; 5359 5360 switch (cmd) { 5361 case _PC_LINK_MAX: 5362 *valp = ULONG_MAX; 5363 return (0); 5364 5365 case _PC_FILESIZEBITS: 5366 *valp = 64; 5367 return (0); 5368 5369 case _PC_XATTR_EXISTS: 5370 zp = VTOZ(vp); 5371 zfsvfs = zp->z_zfsvfs; 5372 ZFS_ENTER(zfsvfs); 5373 ZFS_VERIFY_ZP(zp); 5374 *valp = 0; 5375 error = zfs_dirent_lock(&dl, zp, "", &xzp, 5376 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 5377 if (error == 0) { 5378 zfs_dirent_unlock(dl); 5379 if (!zfs_dirempty(xzp)) 5380 *valp = 1; 5381 VN_RELE(ZTOV(xzp)); 5382 } else if (error == ENOENT) { 5383 /* 5384 * If there aren't extended attributes, it's the 5385 * same as having zero of them. 
5386 */ 5387 error = 0; 5388 } 5389 ZFS_EXIT(zfsvfs); 5390 return (error); 5391 5392 case _PC_SATTR_ENABLED: 5393 case _PC_SATTR_EXISTS: 5394 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 5395 (vp->v_type == VREG || vp->v_type == VDIR); 5396 return (0); 5397 5398 case _PC_ACCESS_FILTERING: 5399 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && 5400 vp->v_type == VDIR; 5401 return (0); 5402 5403 case _PC_ACL_ENABLED: 5404 *valp = _ACL_ACE_ENABLED; 5405 return (0); 5406 5407 case _PC_MIN_HOLE_SIZE: 5408 *valp = (ulong_t)SPA_MINBLOCKSIZE; 5409 return (0); 5410 5411 case _PC_TIMESTAMP_RESOLUTION: 5412 /* nanosecond timestamp resolution */ 5413 *valp = 1L; 5414 return (0); 5415 5416 default: 5417 return (fs_pathconf(vp, cmd, valp, cr, ct)); 5418 } 5419 } 5420 5421 /*ARGSUSED*/ 5422 static int 5423 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 5424 caller_context_t *ct) 5425 { 5426 znode_t *zp = VTOZ(vp); 5427 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5428 int error; 5429 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 5430 5431 ZFS_ENTER(zfsvfs); 5432 ZFS_VERIFY_ZP(zp); 5433 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 5434 ZFS_EXIT(zfsvfs); 5435 5436 return (error); 5437 } 5438 5439 /*ARGSUSED*/ 5440 static int 5441 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 5442 caller_context_t *ct) 5443 { 5444 znode_t *zp = VTOZ(vp); 5445 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5446 int error; 5447 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 5448 zilog_t *zilog = zfsvfs->z_log; 5449 5450 ZFS_ENTER(zfsvfs); 5451 ZFS_VERIFY_ZP(zp); 5452 5453 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 5454 5455 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 5456 zil_commit(zilog, 0); 5457 5458 ZFS_EXIT(zfsvfs); 5459 return (error); 5460 } 5461 5462 /* 5463 * The smallest read we may consider to loan out an arcbuf. 5464 * This must be a power of 2. 5465 */ 5466 int zcr_blksz_min = (1 << 10); /* 1K */ 5467 /* 5468 * If set to less than the file block size, allow loaning out of an 5469 * arcbuf for a partial block read. This must be a power of 2. 5470 */ 5471 int zcr_blksz_max = (1 << 17); /* 128K */ 5472 5473 /*ARGSUSED*/ 5474 static int 5475 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr, 5476 caller_context_t *ct) 5477 { 5478 znode_t *zp = VTOZ(vp); 5479 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5480 int max_blksz = zfsvfs->z_max_blksz; 5481 uio_t *uio = &xuio->xu_uio; 5482 ssize_t size = uio->uio_resid; 5483 offset_t offset = uio->uio_loffset; 5484 int blksz; 5485 int fullblk, i; 5486 arc_buf_t *abuf; 5487 ssize_t maxsize; 5488 int preamble, postamble; 5489 5490 if (xuio->xu_type != UIOTYPE_ZEROCOPY) 5491 return (SET_ERROR(EINVAL)); 5492 5493 ZFS_ENTER(zfsvfs); 5494 ZFS_VERIFY_ZP(zp); 5495 switch (ioflag) { 5496 case UIO_WRITE: 5497 /* 5498 * Loan out an arc_buf for write if write size is bigger than 5499 * max_blksz, and the file's block size is also max_blksz. 5500 */ 5501 blksz = max_blksz; 5502 if (size < blksz || zp->z_blksz != blksz) { 5503 ZFS_EXIT(zfsvfs); 5504 return (SET_ERROR(EINVAL)); 5505 } 5506 /* 5507 * Caller requests buffers for write before knowing where the 5508 * write offset might be (e.g. NFS TCP write). 
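 * When the offset is known, an unaligned start is covered by a partial
 * "preamble" buffer.  As an illustration (values assumed here, not taken
 * from the code): with blksz = 128K and offset = 96K, P2PHASE(offset,
 * blksz) is 96K, so preamble becomes 128K - 96K = 32K and the first
 * loaned arc_buf contributes only its final 32K; an unaligned tail is
 * handled the same way via the postamble.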
5509 */ 5510 if (offset == -1) { 5511 preamble = 0; 5512 } else { 5513 preamble = P2PHASE(offset, blksz); 5514 if (preamble) { 5515 preamble = blksz - preamble; 5516 size -= preamble; 5517 } 5518 } 5519 5520 postamble = P2PHASE(size, blksz); 5521 size -= postamble; 5522 5523 fullblk = size / blksz; 5524 (void) dmu_xuio_init(xuio, 5525 (preamble != 0) + fullblk + (postamble != 0)); 5526 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, 5527 int, postamble, int, 5528 (preamble != 0) + fullblk + (postamble != 0)); 5529 5530 /* 5531 * Have to fix iov base/len for partial buffers. They 5532 * currently represent full arc_buf's. 5533 */ 5534 if (preamble) { 5535 /* data begins in the middle of the arc_buf */ 5536 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5537 blksz); 5538 ASSERT(abuf); 5539 (void) dmu_xuio_add(xuio, abuf, 5540 blksz - preamble, preamble); 5541 } 5542 5543 for (i = 0; i < fullblk; i++) { 5544 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5545 blksz); 5546 ASSERT(abuf); 5547 (void) dmu_xuio_add(xuio, abuf, 0, blksz); 5548 } 5549 5550 if (postamble) { 5551 /* data ends in the middle of the arc_buf */ 5552 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5553 blksz); 5554 ASSERT(abuf); 5555 (void) dmu_xuio_add(xuio, abuf, 0, postamble); 5556 } 5557 break; 5558 case UIO_READ: 5559 /* 5560 * Loan out an arc_buf for read if the read size is larger than 5561 * the current file block size. Block alignment is not 5562 * considered. Partial arc_buf will be loaned out for read. 5563 */ 5564 blksz = zp->z_blksz; 5565 if (blksz < zcr_blksz_min) 5566 blksz = zcr_blksz_min; 5567 if (blksz > zcr_blksz_max) 5568 blksz = zcr_blksz_max; 5569 /* avoid potential complexity of dealing with it */ 5570 if (blksz > max_blksz) { 5571 ZFS_EXIT(zfsvfs); 5572 return (SET_ERROR(EINVAL)); 5573 } 5574 5575 maxsize = zp->z_size - uio->uio_loffset; 5576 if (size > maxsize) 5577 size = maxsize; 5578 5579 if (size < blksz || vn_has_cached_data(vp)) { 5580 ZFS_EXIT(zfsvfs); 5581 return (SET_ERROR(EINVAL)); 5582 } 5583 break; 5584 default: 5585 ZFS_EXIT(zfsvfs); 5586 return (SET_ERROR(EINVAL)); 5587 } 5588 5589 uio->uio_extflg = UIO_XUIO; 5590 XUIO_XUZC_RW(xuio) = ioflag; 5591 ZFS_EXIT(zfsvfs); 5592 return (0); 5593 } 5594 5595 /*ARGSUSED*/ 5596 static int 5597 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) 5598 { 5599 int i; 5600 arc_buf_t *abuf; 5601 int ioflag = XUIO_XUZC_RW(xuio); 5602 5603 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); 5604 5605 i = dmu_xuio_cnt(xuio); 5606 while (i-- > 0) { 5607 abuf = dmu_xuio_arcbuf(xuio, i); 5608 /* 5609 * if abuf == NULL, it must be a write buffer 5610 * that has been returned in zfs_write(). 5611 */ 5612 if (abuf) 5613 dmu_return_arcbuf(abuf); 5614 ASSERT(abuf || ioflag == UIO_WRITE); 5615 } 5616 5617 dmu_xuio_fini(xuio); 5618 return (0); 5619 } 5620 5621 /* 5622 * Predeclare these here so that the compiler assumes that 5623 * this is an "old style" function declaration that does 5624 * not include arguments => we won't get type mismatch errors 5625 * in the initializations that follow. 
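 * (zfs_inval() and zfs_isdir() are wired into the .error slots of the
 * templates below, which expect a function with an unspecified argument
 * list.)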
5626 */ 5627 static int zfs_inval(); 5628 static int zfs_isdir(); 5629 5630 static int 5631 zfs_inval() 5632 { 5633 return (SET_ERROR(EINVAL)); 5634 } 5635 5636 static int 5637 zfs_isdir() 5638 { 5639 return (SET_ERROR(EISDIR)); 5640 } 5641 /* 5642 * Directory vnode operations template 5643 */ 5644 const fs_operation_def_t zfs_dvnodeops_template[] = { 5645 VOPNAME_OPEN, { .vop_open = zfs_open }, 5646 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5647 VOPNAME_READ, { .error = zfs_isdir }, 5648 VOPNAME_WRITE, { .error = zfs_isdir }, 5649 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5650 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5651 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5652 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5653 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5654 VOPNAME_CREATE, { .vop_create = zfs_create }, 5655 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 5656 VOPNAME_LINK, { .vop_link = zfs_link }, 5657 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5658 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir }, 5659 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 5660 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 5661 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink }, 5662 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5663 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5664 VOPNAME_FID, { .vop_fid = zfs_fid }, 5665 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5666 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5667 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5668 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5669 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5670 NULL, NULL 5671 }; 5672 5673 /* 5674 * Regular file vnode operations template 5675 */ 5676 const fs_operation_def_t zfs_fvnodeops_template[] = { 5677 VOPNAME_OPEN, { .vop_open = zfs_open }, 5678 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5679 VOPNAME_READ, { .vop_read = zfs_read }, 5680 VOPNAME_WRITE, { .vop_write = zfs_write }, 5681 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5682 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5683 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5684 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5685 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5686 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5687 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5688 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5689 VOPNAME_FID, { .vop_fid = zfs_fid }, 5690 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5691 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock }, 5692 VOPNAME_SPACE, { .vop_space = zfs_space }, 5693 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage }, 5694 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage }, 5695 VOPNAME_MAP, { .vop_map = zfs_map }, 5696 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap }, 5697 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap }, 5698 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5699 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5700 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5701 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5702 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf }, 5703 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf }, 5704 NULL, NULL 5705 }; 5706 5707 /* 5708 * Symbolic link vnode operations template 5709 */ 5710 const fs_operation_def_t zfs_symvnodeops_template[] = { 5711 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5712 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5713 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5714 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5715 
VOPNAME_READLINK, { .vop_readlink = zfs_readlink }, 5716 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5717 VOPNAME_FID, { .vop_fid = zfs_fid }, 5718 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5719 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5720 NULL, NULL 5721 }; 5722 5723 /* 5724 * special share hidden files vnode operations template 5725 */ 5726 const fs_operation_def_t zfs_sharevnodeops_template[] = { 5727 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5728 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5729 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5730 VOPNAME_FID, { .vop_fid = zfs_fid }, 5731 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5732 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5733 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5734 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5735 NULL, NULL 5736 }; 5737 5738 /* 5739 * Extended attribute directory vnode operations template 5740 * 5741 * This template is identical to the directory vnodes 5742 * operation template except for restricted operations: 5743 * VOP_MKDIR() 5744 * VOP_SYMLINK() 5745 * 5746 * Note that there are other restrictions embedded in: 5747 * zfs_create() - restrict type to VREG 5748 * zfs_link() - no links into/out of attribute space 5749 * zfs_rename() - no moves into/out of attribute space 5750 */ 5751 const fs_operation_def_t zfs_xdvnodeops_template[] = { 5752 VOPNAME_OPEN, { .vop_open = zfs_open }, 5753 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5754 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5755 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5756 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5757 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5758 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5759 VOPNAME_CREATE, { .vop_create = zfs_create }, 5760 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 5761 VOPNAME_LINK, { .vop_link = zfs_link }, 5762 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5763 VOPNAME_MKDIR, { .error = zfs_inval }, 5764 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 5765 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 5766 VOPNAME_SYMLINK, { .error = zfs_inval }, 5767 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5768 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5769 VOPNAME_FID, { .vop_fid = zfs_fid }, 5770 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5771 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5772 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5773 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5774 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5775 NULL, NULL 5776 }; 5777 5778 /* 5779 * Error vnode operations template 5780 */ 5781 const fs_operation_def_t zfs_evnodeops_template[] = { 5782 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5783 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5784 NULL, NULL 5785 }; 5786
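
/*
 * For reference, a sketch of how these templates are consumed (the code
 * lives elsewhere and the names below are illustrative): during file
 * system initialization each template is turned into a vnodeops_t by the
 * generic vn_make_ops() routine, roughly
 *
 *	vnodeops_t *zfs_dvnodeops;
 *	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
 *	    &zfs_dvnodeops);
 *
 * In illumos this is handled in zfs_vfsops.c (zfs_create_op_tables()).
 */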