/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/dsl_dataset.h>
#include <sys/dmu.h>
#include <sys/refcount.h>
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/dsl_pool.h>

/*
 * Deadlist concurrency:
 *
 * Deadlists can only be modified from the syncing thread.
 *
 * Except for dsl_deadlist_insert(), it can only be modified with the
 * dp_config_rwlock held with RW_WRITER.
 *
 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
 * be called concurrently, from open context, with the dp_config_rwlock held
 * with RW_READER.
 *
 * Therefore, we only need to provide locking between dsl_deadlist_insert() and
 * the accessors, protecting:
 *     dl_phys->dl_used,comp,uncomp
 * and protecting the dl_tree from being loaded.
 * The locking is provided by dl_lock.  Note that the bpobj_t
Note that locking on the bpobj_t 52 * provides its own locking, and dl_oldfmt is immutable. 53 */ 54 55 static int 56 dsl_deadlist_compare(const void *arg1, const void *arg2) 57 { 58 const dsl_deadlist_entry_t *dle1 = arg1; 59 const dsl_deadlist_entry_t *dle2 = arg2; 60 61 if (dle1->dle_mintxg < dle2->dle_mintxg) 62 return (-1); 63 else if (dle1->dle_mintxg > dle2->dle_mintxg) 64 return (+1); 65 else 66 return (0); 67 } 68 69 static void 70 dsl_deadlist_load_tree(dsl_deadlist_t *dl) 71 { 72 zap_cursor_t zc; 73 zap_attribute_t za; 74 75 ASSERT(MUTEX_HELD(&dl->dl_lock)); 76 77 ASSERT(!dl->dl_oldfmt); 78 if (dl->dl_havetree) 79 return; 80 81 avl_create(&dl->dl_tree, dsl_deadlist_compare, 82 sizeof (dsl_deadlist_entry_t), 83 offsetof(dsl_deadlist_entry_t, dle_node)); 84 for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 85 zap_cursor_retrieve(&zc, &za) == 0; 86 zap_cursor_advance(&zc)) { 87 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 88 dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); 89 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, 90 za.za_first_integer)); 91 avl_add(&dl->dl_tree, dle); 92 } 93 zap_cursor_fini(&zc); 94 dl->dl_havetree = B_TRUE; 95 } 96 97 void 98 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) 99 { 100 dmu_object_info_t doi; 101 102 ASSERT(!dsl_deadlist_is_open(dl)); 103 104 mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); 105 dl->dl_os = os; 106 dl->dl_object = object; 107 VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); 108 dmu_object_info_from_db(dl->dl_dbuf, &doi); 109 if (doi.doi_type == DMU_OT_BPOBJ) { 110 dmu_buf_rele(dl->dl_dbuf, dl); 111 dl->dl_dbuf = NULL; 112 dl->dl_oldfmt = B_TRUE; 113 VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); 114 return; 115 } 116 117 dl->dl_oldfmt = B_FALSE; 118 dl->dl_phys = dl->dl_dbuf->db_data; 119 dl->dl_havetree = B_FALSE; 120 } 121 122 boolean_t 123 dsl_deadlist_is_open(dsl_deadlist_t *dl) 124 { 125 return (dl->dl_os != NULL); 
126 } 127 128 void 129 dsl_deadlist_close(dsl_deadlist_t *dl) 130 { 131 void *cookie = NULL; 132 dsl_deadlist_entry_t *dle; 133 134 ASSERT(dsl_deadlist_is_open(dl)); 135 136 if (dl->dl_oldfmt) { 137 dl->dl_oldfmt = B_FALSE; 138 bpobj_close(&dl->dl_bpobj); 139 dl->dl_os = NULL; 140 dl->dl_object = 0; 141 return; 142 } 143 144 if (dl->dl_havetree) { 145 while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) 146 != NULL) { 147 bpobj_close(&dle->dle_bpobj); 148 kmem_free(dle, sizeof (*dle)); 149 } 150 avl_destroy(&dl->dl_tree); 151 } 152 dmu_buf_rele(dl->dl_dbuf, dl); 153 mutex_destroy(&dl->dl_lock); 154 dl->dl_dbuf = NULL; 155 dl->dl_phys = NULL; 156 dl->dl_os = NULL; 157 dl->dl_object = 0; 158 } 159 160 uint64_t 161 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) 162 { 163 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 164 return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); 165 return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, 166 sizeof (dsl_deadlist_phys_t), tx)); 167 } 168 169 void 170 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) 171 { 172 dmu_object_info_t doi; 173 zap_cursor_t zc; 174 zap_attribute_t za; 175 176 VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); 177 if (doi.doi_type == DMU_OT_BPOBJ) { 178 bpobj_free(os, dlobj, tx); 179 return; 180 } 181 182 for (zap_cursor_init(&zc, os, dlobj); 183 zap_cursor_retrieve(&zc, &za) == 0; 184 zap_cursor_advance(&zc)) { 185 uint64_t obj = za.za_first_integer; 186 if (obj == dmu_objset_pool(os)->dp_empty_bpobj) 187 bpobj_decr_empty(os, tx); 188 else 189 bpobj_free(os, obj, tx); 190 } 191 zap_cursor_fini(&zc); 192 VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); 193 } 194 195 static void 196 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 197 const blkptr_t *bp, dmu_tx_t *tx) 198 { 199 ASSERT(MUTEX_HELD(&dl->dl_lock)); 200 if (dle->dle_bpobj.bpo_object == 201 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 202 uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, 
tx); 203 bpobj_close(&dle->dle_bpobj); 204 bpobj_decr_empty(dl->dl_os, tx); 205 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 206 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 207 dle->dle_mintxg, obj, tx)); 208 } 209 bpobj_enqueue(&dle->dle_bpobj, bp, tx); 210 } 211 212 static void 213 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 214 uint64_t obj, dmu_tx_t *tx) 215 { 216 ASSERT(MUTEX_HELD(&dl->dl_lock)); 217 if (dle->dle_bpobj.bpo_object != 218 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 219 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); 220 } else { 221 bpobj_close(&dle->dle_bpobj); 222 bpobj_decr_empty(dl->dl_os, tx); 223 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 224 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 225 dle->dle_mintxg, obj, tx)); 226 } 227 } 228 229 void 230 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) 231 { 232 dsl_deadlist_entry_t dle_tofind; 233 dsl_deadlist_entry_t *dle; 234 avl_index_t where; 235 236 if (dl->dl_oldfmt) { 237 bpobj_enqueue(&dl->dl_bpobj, bp, tx); 238 return; 239 } 240 241 mutex_enter(&dl->dl_lock); 242 dsl_deadlist_load_tree(dl); 243 244 dmu_buf_will_dirty(dl->dl_dbuf, tx); 245 dl->dl_phys->dl_used += 246 bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); 247 dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); 248 dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); 249 250 dle_tofind.dle_mintxg = bp->blk_birth; 251 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 252 if (dle == NULL) 253 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 254 else 255 dle = AVL_PREV(&dl->dl_tree, dle); 256 dle_enqueue(dl, dle, bp, tx); 257 mutex_exit(&dl->dl_lock); 258 } 259 260 /* 261 * Insert new key in deadlist, which must be > all current entries. 262 * mintxg is not inclusive. 
 */
void
dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
	uint64_t obj;
	dsl_deadlist_entry_t *dle;

	/* Old-format deadlists (plain bpobjs) have no keys. */
	if (dl->dl_oldfmt)
		return;

	/* NOTE: the entry is allocated before dl_lock is taken. */
	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
	dle->dle_mintxg = mintxg;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	/*
	 * New entries start out referencing the pool's shared empty
	 * bpobj; a private bpobj is substituted on first write (see
	 * dle_enqueue()).
	 */
	obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
	avl_add(&dl->dl_tree, dle);

	/* Mirror the new entry in the on-disk ZAP. */
	VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
	    mintxg, obj, tx));
	mutex_exit(&dl->dl_lock);
}

/*
 * Remove this key, merging its entries into the previous key.
 */
void
dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle, *dle_prev;

	if (dl->dl_oldfmt)
		return;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	/*
	 * The caller must pass a mintxg that is an existing, non-first
	 * key: there is no NULL check here, so avl_find() must succeed
	 * and a previous entry must exist.
	 */
	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
	dle_prev = AVL_PREV(&dl->dl_tree, dle);

	/* Fold this entry's bpobj into the previous entry's bpobj. */
	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);

	avl_remove(&dl->dl_tree, dle);
	bpobj_close(&dle->dle_bpobj);
	kmem_free(dle, sizeof (*dle));

	/* Drop the key from the on-disk ZAP as well. */
	VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
	mutex_exit(&dl->dl_lock);
}

/*
 * Walk ds's snapshots to regenerate the ZAP & AVL.
 */
static void
dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
    uint64_t mrs_obj, dmu_tx_t *tx)
{
	dsl_deadlist_t dl = { 0 };
	dsl_pool_t *dp = dmu_objset_pool(os);

	dsl_deadlist_open(&dl, os, dlobj);
	if (dl.dl_oldfmt) {
		/* Old-format deadlists have no keys to regenerate. */
		dsl_deadlist_close(&dl);
		return;
	}

	/*
	 * Walk the snapshot chain backwards starting at mrs_obj,
	 * adding each snapshot's ds_prev_snap_txg as a key.
	 */
	while (mrs_obj != 0) {
		dsl_dataset_t *ds;
		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
		dsl_deadlist_add_key(&dl,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
		dsl_dataset_rele(ds, FTAG);
	}
	dsl_deadlist_close(&dl);
}

/*
 * Create a new deadlist object with the same keys as 'dl', up to (but
 * not including) maxtxg.  Only the key structure is copied: each new
 * entry gets an empty bpobj, not the original block pointers.  For an
 * old-format 'dl' the keys are regenerated by walking the snapshot
 * chain starting at mrs_obj instead.  Returns the new object number.
 */
uint64_t
dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
    uint64_t mrs_obj, dmu_tx_t *tx)
{
	dsl_deadlist_entry_t *dle;
	uint64_t newobj;

	newobj = dsl_deadlist_alloc(dl->dl_os, tx);

	if (dl->dl_oldfmt) {
		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
		return (newobj);
	}

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	for (dle = avl_first(&dl->dl_tree); dle;
	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
		uint64_t obj;

		/* Entries are sorted by mintxg, so we can stop early. */
		if (dle->dle_mintxg >= maxtxg)
			break;

		/* Each cloned key starts with an empty bpobj. */
		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
		    dle->dle_mintxg, obj, tx));
	}
	mutex_exit(&dl->dl_lock);
	return (newobj);
}

/*
 * Fetch the deadlist's total used/compressed/uncompressed space into
 * *usedp/*compp/*uncompp.  May be called concurrently from open
 * context (see the locking comment at the top of this file).
 */
void
dsl_deadlist_space(dsl_deadlist_t *dl,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(dsl_deadlist_is_open(dl));
	if (dl->dl_oldfmt) {
		/* Old format: the bpobj tracks its own totals. */
		VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
		    usedp, compp, uncompp));
		return;
	}

	/* dl_lock protects dl_phys against concurrent inserts. */
	mutex_enter(&dl->dl_lock);
	*usedp = dl->dl_phys->dl_used;
	*compp = dl->dl_phys->dl_comp;
	*uncompp = dl->dl_phys->dl_uncomp;
	mutex_exit(&dl->dl_lock);
}

/*
 * return space used in the range (mintxg, maxtxg].
 * Includes maxtxg, does not include mintxg.
 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
 * larger than any bp in the deadlist (eg. UINT64_MAX)).
 */
void
dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	dsl_deadlist_entry_t *dle;
	dsl_deadlist_entry_t dle_tofind;
	avl_index_t where;

	if (dl->dl_oldfmt) {
		/* Old format: delegate the range query to the bpobj. */
		VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
		    mintxg, maxtxg, usedp, compp, uncompp));
		return;
	}

	*usedp = *compp = *uncompp = 0;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);
	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	/*
	 * If we don't find this mintxg, there shouldn't be anything
	 * after it either.
	 */
	ASSERT(dle != NULL ||
	    avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);

	/* Sum each entry's bpobj space until we reach maxtxg. */
	for (; dle && dle->dle_mintxg < maxtxg;
	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
		uint64_t used, comp, uncomp;

		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
		    &used, &comp, &uncomp));

		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;
	}
	mutex_exit(&dl->dl_lock);
}

/*
 * Attach the bpobj 'obj' to the deadlist entry covering txg 'birth',
 * adding the bpobj's space totals to dl_phys.  Unlike
 * dsl_deadlist_insert(), an exact key match uses that entry directly.
 * Caller must hold dl_lock.
 */
static void
dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
    dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;
	uint64_t used, comp, uncomp;
	bpobj_t bpo;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	/* Open the bpobj just long enough to read its space totals. */
	VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
	VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
	bpobj_close(&bpo);

	dsl_deadlist_load_tree(dl);

	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dl->dl_phys->dl_used += used;
	dl->dl_phys->dl_comp += comp;
	dl->dl_phys->dl_uncomp += uncomp;

	dle_tofind.dle_mintxg = birth;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
	dle_enqueue_subobj(dl, dle, obj, tx);
}

/* bpobj_iterate() callback: feed each bp into the deadlist. */
static int
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_deadlist_insert(dl, bp, tx);
	return (0);
}

/*
 * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
 * an empty deadlist.
 */
void
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	dmu_buf_t *bonus;
	dsl_deadlist_phys_t *dlp;
	dmu_object_info_t doi;

	VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
	if (doi.doi_type == DMU_OT_BPOBJ) {
		/* Old-format source: re-insert every bp individually. */
		bpobj_t bpo;
		VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
		VERIFY3U(0, ==, bpobj_iterate(&bpo,
		    dsl_deadlist_insert_cb, dl, tx));
		bpobj_close(&bpo);
		return;
	}

	mutex_enter(&dl->dl_lock);
	for (zap_cursor_init(&zc, dl->dl_os, obj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
		/* Move each sub-bpobj into dl, then unlink it from obj. */
		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
	}
	zap_cursor_fini(&zc);

	/* Zero the source deadlist's space accounting. */
	VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
	dlp = bonus->db_data;
	dmu_buf_will_dirty(bonus, tx);
	bzero(dlp, sizeof (*dlp));
	dmu_buf_rele(bonus, FTAG);
	mutex_exit(&dl->dl_lock);
}

/*
 * Remove entries on dl that are >= mintxg, and put them on the bpobj.
 */
void
dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
    dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;

	ASSERT(!dl->dl_oldfmt);

	mutex_enter(&dl->dl_lock);
	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dsl_deadlist_load_tree(dl);

	/* Find the first entry with dle_mintxg >= mintxg. */
	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
	while (dle) {
		uint64_t used, comp, uncomp;
		dsl_deadlist_entry_t *dle_next;

		/* Hand the entry's bpobj over to 'bpo'. */
		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);

		/* Deduct the moved entry's space from dl's accounting. */
		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
		    &used, &comp, &uncomp));
		ASSERT3U(dl->dl_phys->dl_used, >=, used);
		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
		dl->dl_phys->dl_used -= used;
		dl->dl_phys->dl_comp -= comp;
		dl->dl_phys->dl_uncomp -= uncomp;

		/* Drop the key from the on-disk ZAP. */
		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, tx));

		/* Remove and free the in-core entry. */
		dle_next = AVL_NEXT(&dl->dl_tree, dle);
		avl_remove(&dl->dl_tree, dle);
		bpobj_close(&dle->dle_bpobj);
		kmem_free(dle, sizeof (*dle));
		dle = dle_next;
	}
	mutex_exit(&dl->dl_lock);
}