/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/dsl_dataset.h>
#include <sys/dmu.h>
#include <sys/refcount.h>
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/dsl_pool.h>

/*
 * Deadlist concurrency:
 *
 * Deadlists can only be modified from the syncing thread.
 *
 * Except for dsl_deadlist_insert(), it can only be modified with the
 * dp_config_rwlock held with RW_WRITER.
 *
 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
 * be called concurrently, from open context, with the dp_config_rwlock held
 * with RW_READER.
 *
 * Therefore, we only need to provide locking between dsl_deadlist_insert() and
 * the accessors, protecting:
 *     dl_phys->dl_used,comp,uncomp
 *     and protecting the dl_tree from being loaded.
 * The locking is provided by dl_lock.
Note that locking on the bpobj_t 52 * provides its own locking, and dl_oldfmt is immutable. 53 */ 54 55 static int 56 dsl_deadlist_compare(const void *arg1, const void *arg2) 57 { 58 const dsl_deadlist_entry_t *dle1 = arg1; 59 const dsl_deadlist_entry_t *dle2 = arg2; 60 61 if (dle1->dle_mintxg < dle2->dle_mintxg) 62 return (-1); 63 else if (dle1->dle_mintxg > dle2->dle_mintxg) 64 return (+1); 65 else 66 return (0); 67 } 68 69 static void 70 dsl_deadlist_load_tree(dsl_deadlist_t *dl) 71 { 72 zap_cursor_t zc; 73 zap_attribute_t za; 74 75 ASSERT(MUTEX_HELD(&dl->dl_lock)); 76 77 ASSERT(!dl->dl_oldfmt); 78 if (dl->dl_havetree) 79 return; 80 81 avl_create(&dl->dl_tree, dsl_deadlist_compare, 82 sizeof (dsl_deadlist_entry_t), 83 offsetof(dsl_deadlist_entry_t, dle_node)); 84 for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 85 zap_cursor_retrieve(&zc, &za) == 0; 86 zap_cursor_advance(&zc)) { 87 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 88 dle->dle_mintxg = strtonum(za.za_name, NULL); 89 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, 90 za.za_first_integer)); 91 avl_add(&dl->dl_tree, dle); 92 } 93 zap_cursor_fini(&zc); 94 dl->dl_havetree = B_TRUE; 95 } 96 97 void 98 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) 99 { 100 dmu_object_info_t doi; 101 102 mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); 103 dl->dl_os = os; 104 dl->dl_object = object; 105 VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); 106 dmu_object_info_from_db(dl->dl_dbuf, &doi); 107 if (doi.doi_type == DMU_OT_BPOBJ) { 108 dmu_buf_rele(dl->dl_dbuf, dl); 109 dl->dl_dbuf = NULL; 110 dl->dl_oldfmt = B_TRUE; 111 VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); 112 return; 113 } 114 115 dl->dl_oldfmt = B_FALSE; 116 dl->dl_phys = dl->dl_dbuf->db_data; 117 dl->dl_havetree = B_FALSE; 118 } 119 120 void 121 dsl_deadlist_close(dsl_deadlist_t *dl) 122 { 123 void *cookie = NULL; 124 dsl_deadlist_entry_t *dle; 125 126 dl->dl_os = NULL; 127 
128 if (dl->dl_oldfmt) { 129 dl->dl_oldfmt = B_FALSE; 130 bpobj_close(&dl->dl_bpobj); 131 return; 132 } 133 134 if (dl->dl_havetree) { 135 while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) 136 != NULL) { 137 bpobj_close(&dle->dle_bpobj); 138 kmem_free(dle, sizeof (*dle)); 139 } 140 avl_destroy(&dl->dl_tree); 141 } 142 dmu_buf_rele(dl->dl_dbuf, dl); 143 mutex_destroy(&dl->dl_lock); 144 dl->dl_dbuf = NULL; 145 dl->dl_phys = NULL; 146 } 147 148 uint64_t 149 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) 150 { 151 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 152 return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); 153 return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, 154 sizeof (dsl_deadlist_phys_t), tx)); 155 } 156 157 void 158 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) 159 { 160 dmu_object_info_t doi; 161 zap_cursor_t zc; 162 zap_attribute_t za; 163 164 VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); 165 if (doi.doi_type == DMU_OT_BPOBJ) { 166 bpobj_free(os, dlobj, tx); 167 return; 168 } 169 170 for (zap_cursor_init(&zc, os, dlobj); 171 zap_cursor_retrieve(&zc, &za) == 0; 172 zap_cursor_advance(&zc)) { 173 uint64_t obj = za.za_first_integer; 174 if (obj == dmu_objset_pool(os)->dp_empty_bpobj) 175 bpobj_decr_empty(os, tx); 176 else 177 bpobj_free(os, obj, tx); 178 } 179 zap_cursor_fini(&zc); 180 VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); 181 } 182 183 static void 184 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 185 const blkptr_t *bp, dmu_tx_t *tx) 186 { 187 ASSERT(MUTEX_HELD(&dl->dl_lock)); 188 if (dle->dle_bpobj.bpo_object == 189 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 190 uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 191 bpobj_close(&dle->dle_bpobj); 192 bpobj_decr_empty(dl->dl_os, tx); 193 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 194 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 195 dle->dle_mintxg, obj, tx)); 196 } 197 
bpobj_enqueue(&dle->dle_bpobj, bp, tx); 198 } 199 200 static void 201 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 202 uint64_t obj, dmu_tx_t *tx) 203 { 204 ASSERT(MUTEX_HELD(&dl->dl_lock)); 205 if (dle->dle_bpobj.bpo_object != 206 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 207 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); 208 } else { 209 bpobj_close(&dle->dle_bpobj); 210 bpobj_decr_empty(dl->dl_os, tx); 211 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 212 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 213 dle->dle_mintxg, obj, tx)); 214 } 215 } 216 217 void 218 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) 219 { 220 dsl_deadlist_entry_t dle_tofind; 221 dsl_deadlist_entry_t *dle; 222 avl_index_t where; 223 224 if (dl->dl_oldfmt) { 225 bpobj_enqueue(&dl->dl_bpobj, bp, tx); 226 return; 227 } 228 229 mutex_enter(&dl->dl_lock); 230 dsl_deadlist_load_tree(dl); 231 232 dmu_buf_will_dirty(dl->dl_dbuf, tx); 233 dl->dl_phys->dl_used += 234 bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); 235 dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); 236 dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); 237 238 dle_tofind.dle_mintxg = bp->blk_birth; 239 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 240 if (dle == NULL) 241 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 242 else 243 dle = AVL_PREV(&dl->dl_tree, dle); 244 dle_enqueue(dl, dle, bp, tx); 245 mutex_exit(&dl->dl_lock); 246 } 247 248 /* 249 * Insert new key in deadlist, which must be > all current entries. 250 * mintxg is not inclusive. 
251 */ 252 void 253 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 254 { 255 uint64_t obj; 256 dsl_deadlist_entry_t *dle; 257 258 if (dl->dl_oldfmt) 259 return; 260 261 dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 262 dle->dle_mintxg = mintxg; 263 264 mutex_enter(&dl->dl_lock); 265 dsl_deadlist_load_tree(dl); 266 267 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 268 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 269 avl_add(&dl->dl_tree, dle); 270 271 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, 272 mintxg, obj, tx)); 273 mutex_exit(&dl->dl_lock); 274 } 275 276 /* 277 * Remove this key, merging its entries into the previous key. 278 */ 279 void 280 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 281 { 282 dsl_deadlist_entry_t dle_tofind; 283 dsl_deadlist_entry_t *dle, *dle_prev; 284 285 if (dl->dl_oldfmt) 286 return; 287 288 mutex_enter(&dl->dl_lock); 289 dsl_deadlist_load_tree(dl); 290 291 dle_tofind.dle_mintxg = mintxg; 292 dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); 293 dle_prev = AVL_PREV(&dl->dl_tree, dle); 294 295 dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); 296 297 avl_remove(&dl->dl_tree, dle); 298 bpobj_close(&dle->dle_bpobj); 299 kmem_free(dle, sizeof (*dle)); 300 301 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); 302 mutex_exit(&dl->dl_lock); 303 } 304 305 /* 306 * Walk ds's snapshots to regenerate generate ZAP & AVL. 
307 */ 308 static void 309 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, 310 uint64_t mrs_obj, dmu_tx_t *tx) 311 { 312 dsl_deadlist_t dl; 313 dsl_pool_t *dp = dmu_objset_pool(os); 314 315 dsl_deadlist_open(&dl, os, dlobj); 316 if (dl.dl_oldfmt) { 317 dsl_deadlist_close(&dl); 318 return; 319 } 320 321 while (mrs_obj != 0) { 322 dsl_dataset_t *ds; 323 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); 324 dsl_deadlist_add_key(&dl, 325 dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); 326 mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; 327 dsl_dataset_rele(ds, FTAG); 328 } 329 dsl_deadlist_close(&dl); 330 } 331 332 uint64_t 333 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, 334 uint64_t mrs_obj, dmu_tx_t *tx) 335 { 336 dsl_deadlist_entry_t *dle; 337 uint64_t newobj; 338 339 newobj = dsl_deadlist_alloc(dl->dl_os, tx); 340 341 if (dl->dl_oldfmt) { 342 dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); 343 return (newobj); 344 } 345 346 mutex_enter(&dl->dl_lock); 347 dsl_deadlist_load_tree(dl); 348 349 for (dle = avl_first(&dl->dl_tree); dle; 350 dle = AVL_NEXT(&dl->dl_tree, dle)) { 351 uint64_t obj; 352 353 if (dle->dle_mintxg >= maxtxg) 354 break; 355 356 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 357 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, 358 dle->dle_mintxg, obj, tx)); 359 } 360 mutex_exit(&dl->dl_lock); 361 return (newobj); 362 } 363 364 void 365 dsl_deadlist_space(dsl_deadlist_t *dl, 366 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 367 { 368 if (dl->dl_oldfmt) { 369 VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, 370 usedp, compp, uncompp)); 371 return; 372 } 373 374 mutex_enter(&dl->dl_lock); 375 *usedp = dl->dl_phys->dl_used; 376 *compp = dl->dl_phys->dl_comp; 377 *uncompp = dl->dl_phys->dl_uncomp; 378 mutex_exit(&dl->dl_lock); 379 } 380 381 /* 382 * return space used in the range (mintxg, maxtxg]. 383 * Includes maxtxg, does not include mintxg. 
384 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is 385 * larger than any bp in the deadlist (eg. UINT64_MAX)). 386 */ 387 void 388 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, 389 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 390 { 391 dsl_deadlist_entry_t *dle; 392 dsl_deadlist_entry_t dle_tofind; 393 avl_index_t where; 394 395 if (dl->dl_oldfmt) { 396 VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, 397 mintxg, maxtxg, usedp, compp, uncompp)); 398 return; 399 } 400 401 *usedp = *compp = *uncompp = 0; 402 403 mutex_enter(&dl->dl_lock); 404 dsl_deadlist_load_tree(dl); 405 dle_tofind.dle_mintxg = mintxg; 406 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 407 /* 408 * If we don't find this mintxg, there shouldn't be anything 409 * after it either. 410 */ 411 ASSERT(dle != NULL || 412 avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); 413 414 for (; dle && dle->dle_mintxg < maxtxg; 415 dle = AVL_NEXT(&dl->dl_tree, dle)) { 416 uint64_t used, comp, uncomp; 417 418 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 419 &used, &comp, &uncomp)); 420 421 *usedp += used; 422 *compp += comp; 423 *uncompp += uncomp; 424 } 425 mutex_exit(&dl->dl_lock); 426 } 427 428 static void 429 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, 430 dmu_tx_t *tx) 431 { 432 dsl_deadlist_entry_t dle_tofind; 433 dsl_deadlist_entry_t *dle; 434 avl_index_t where; 435 uint64_t used, comp, uncomp; 436 bpobj_t bpo; 437 438 ASSERT(MUTEX_HELD(&dl->dl_lock)); 439 440 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 441 VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); 442 bpobj_close(&bpo); 443 444 dsl_deadlist_load_tree(dl); 445 446 dmu_buf_will_dirty(dl->dl_dbuf, tx); 447 dl->dl_phys->dl_used += used; 448 dl->dl_phys->dl_comp += comp; 449 dl->dl_phys->dl_uncomp += uncomp; 450 451 dle_tofind.dle_mintxg = birth; 452 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 453 if (dle == NULL) 454 dle = 
avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 455 dle_enqueue_subobj(dl, dle, obj, tx); 456 } 457 458 static int 459 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 460 { 461 dsl_deadlist_t *dl = arg; 462 dsl_deadlist_insert(dl, bp, tx); 463 return (0); 464 } 465 466 /* 467 * Merge the deadlist pointed to by 'obj' into dl. obj will be left as 468 * an empty deadlist. 469 */ 470 void 471 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) 472 { 473 zap_cursor_t zc; 474 zap_attribute_t za; 475 dmu_buf_t *bonus; 476 dsl_deadlist_phys_t *dlp; 477 dmu_object_info_t doi; 478 479 VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); 480 if (doi.doi_type == DMU_OT_BPOBJ) { 481 bpobj_t bpo; 482 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 483 VERIFY3U(0, ==, bpobj_iterate(&bpo, 484 dsl_deadlist_insert_cb, dl, tx)); 485 bpobj_close(&bpo); 486 return; 487 } 488 489 mutex_enter(&dl->dl_lock); 490 for (zap_cursor_init(&zc, dl->dl_os, obj); 491 zap_cursor_retrieve(&zc, &za) == 0; 492 zap_cursor_advance(&zc)) { 493 uint64_t mintxg = strtonum(za.za_name, NULL); 494 dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); 495 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); 496 } 497 zap_cursor_fini(&zc); 498 499 VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); 500 dlp = bonus->db_data; 501 dmu_buf_will_dirty(bonus, tx); 502 bzero(dlp, sizeof (*dlp)); 503 dmu_buf_rele(bonus, FTAG); 504 mutex_exit(&dl->dl_lock); 505 } 506 507 /* 508 * Remove entries on dl that are >= mintxg, and put them on the bpobj. 
509 */ 510 void 511 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, 512 dmu_tx_t *tx) 513 { 514 dsl_deadlist_entry_t dle_tofind; 515 dsl_deadlist_entry_t *dle; 516 avl_index_t where; 517 518 ASSERT(!dl->dl_oldfmt); 519 520 mutex_enter(&dl->dl_lock); 521 dmu_buf_will_dirty(dl->dl_dbuf, tx); 522 dsl_deadlist_load_tree(dl); 523 524 dle_tofind.dle_mintxg = mintxg; 525 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 526 if (dle == NULL) 527 dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); 528 while (dle) { 529 uint64_t used, comp, uncomp; 530 dsl_deadlist_entry_t *dle_next; 531 532 bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); 533 534 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 535 &used, &comp, &uncomp)); 536 ASSERT3U(dl->dl_phys->dl_used, >=, used); 537 ASSERT3U(dl->dl_phys->dl_comp, >=, comp); 538 ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); 539 dl->dl_phys->dl_used -= used; 540 dl->dl_phys->dl_comp -= comp; 541 dl->dl_phys->dl_uncomp -= uncomp; 542 543 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, 544 dle->dle_mintxg, tx)); 545 546 dle_next = AVL_NEXT(&dl->dl_tree, dle); 547 avl_remove(&dl->dl_tree, dle); 548 bpobj_close(&dle->dle_bpobj); 549 kmem_free(dle, sizeof (*dle)); 550 dle = dle_next; 551 } 552 mutex_exit(&dl->dl_lock); 553 } 554