xref: /illumos-gate/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c (revision b6805bf78d2bbbeeaea8909a05623587b42d58b3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/conf.h>
26 #include <sys/file.h>
27 #include <sys/ddi.h>
28 #include <sys/sunddi.h>
29 #include <sys/modctl.h>
30 #include <sys/scsi/scsi.h>
31 #include <sys/scsi/impl/scsi_reset_notify.h>
32 #include <sys/scsi/generic/mode.h>
33 #include <sys/disp.h>
34 #include <sys/byteorder.h>
35 #include <sys/atomic.h>
36 #include <sys/sdt.h>
37 #include <sys/dkio.h>
38 #include <sys/dmu.h>
39 #include <sys/arc.h>
40 #include <sys/zvol.h>
41 #include <sys/zfs_rlock.h>
42 
43 #include <sys/stmf.h>
44 #include <sys/lpif.h>
45 #include <sys/portif.h>
46 #include <sys/stmf_ioctl.h>
47 #include <sys/stmf_sbd_ioctl.h>
48 
49 #include "stmf_sbd.h"
50 #include "sbd_impl.h"
51 
52 
53 /*
54  * This file contains direct calls into the zfs module.
55  * These functions mimic zvol_read and zvol_write except pointers
56  * to the data buffers are passed instead of copying the data itself.
57  *
58  * zfs internal interfaces referenced here:
59  *
60  * FUNCTIONS
61  *    dmu_buf_hold_array_by_bonus()
62  *    dmu_buf_rele_array()
63  *
 *    dmu_request_arcbuf()
 *    dmu_assign_arcbuf()
 *    dmu_return_arcbuf()
67  *    arc_buf_size()
68  *
69  *    dmu_tx_create()
70  *    dmu_tx_hold_write()
71  *    dmu_tx_assign()
72  *    dmu_tx_commit(tx)
73  *    dmu_tx_abort(tx)
74  *    zil_commit()
75  *
76  *    zfs_range_lock()
77  *    zfs_range_unlock()
78  *
79  *    zvol_log_write()
80  *
81  *    dmu_read_uio()
82  *    dmu_write_uio()
83  * MINOR DATA
84  *    zv_volsize
85  *    zv_volblocksize
86  *    zv_flags		- for WCE
87  *    zv_objset		- dmu_tx_create
88  *    zv_zilog		- zil_commit
89  *    zv_znode		- zfs_range_lock
90  *    zv_dbuf		- dmu_buf_hold_array_by_bonus, dmu_request_arcbuf
91  * GLOBAL DATA
92  *    zvol_maxphys
93  */
94 
95 /*
96  * Take direct control of the volume instead of using the driver
97  * interfaces provided by zvol.c. Gather parameters and handles
98  * needed to make direct calls into zfs/dmu/zvol. The driver is
99  * opened exclusively at this point, so these parameters cannot change.
100  *
101  * NOTE: the object size and WCE can change while the device
102  * is open, so they must be fetched for every operation.
103  */
104 int
105 sbd_zvol_get_volume_params(sbd_lu_t *sl)
106 {
107 	int ret;
108 
109 	ret = zvol_get_volume_params(sl->sl_zvol_minor,
110 	    &sl->sl_blksize,		/* volume block size */
111 	    &sl->sl_max_xfer_len,	/* max data chunk size */
112 	    &sl->sl_zvol_minor_hdl,	/* minor soft state */
113 	    &sl->sl_zvol_objset_hdl,	/* dmu_tx_create */
114 	    &sl->sl_zvol_zil_hdl,	/* zil_commit */
115 	    &sl->sl_zvol_rl_hdl,	/* zfs_range_lock */
116 	    &sl->sl_zvol_bonus_hdl);	/* dmu_buf_hold_array_by_bonus, */
117 					/* dmu_request_arcbuf, */
118 					/* dmu_assign_arcbuf */
119 
120 	if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) {
121 		cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to "
122 		    "small zvol blocksize (%d)\n", (int)sl->sl_blksize);
123 		ret = ENOTSUP;
124 	}
125 
126 	return (ret);
127 }
128 
129 /*
130  * Return the number of elements in a scatter/gather list required for
131  * the given span in the zvol. Elements are 1:1 with zvol blocks.
132  */
133 uint32_t
134 sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len)
135 {
136 	uint64_t blksz = sl->sl_blksize;
137 	uint64_t endoff = off + len;
138 	uint64_t numsegs;
139 
140 	numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz;
141 	return ((uint32_t)numsegs);
142 }
143 
/*
 * Return an array of dmu_buf_t pointers for the requested range.
 * The dmu buffers are either in cache or read in synchronously.
 * Fill in the dbuf sglist from the dmu_buf_t array.
 *
 * Returns 0 on success, E2BIG if the request exceeds the maximum
 * transfer length, EIO if the request runs past the end of the volume
 * or the read fails, or the dmu_buf_hold_array_by_bonus() error.
 * On success the held buffer array is stashed in zvio_dbp; the caller
 * must release it via sbd_zvol_rele_read_bufs().
 */
static void *RDTAG = "sbd_zvol_read";	/* hold tag for dmu buf hold/rele */

int
sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
	rl_t 		*rl;
	int 		numbufs, error;
	uint64_t 	len = dbuf->db_data_size;
	uint64_t 	offset = zvio->zvio_offset;
	dmu_buf_t	**dbpp, *dbp;

	/* Make sure request is reasonable */
	if (len > sl->sl_max_xfer_len)
		return (E2BIG);
	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
		return (EIO);

	/*
	 * The range lock is only held until the dmu buffers read in and
	 * held; not during the callers use of the data.
	 */
	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);

	/* Hold (and synchronously read in, if needed) the covering blocks. */
	error = dmu_buf_hold_array_by_bonus(sl->sl_zvol_bonus_hdl, offset,
	    len, TRUE, RDTAG, &numbufs, &dbpp);

	zfs_range_unlock(rl);

	/* Map checksum failures to a generic I/O error for the caller. */
	if (error == ECKSUM)
		error = EIO;

	if (error == 0) {
		/*
		 * Fill in db_sglist from the dmu_buf_t array.
		 */
		int		i;
		stmf_sglist_ent_t *sgl;
		uint64_t	odiff, seglen;

		zvio->zvio_dbp = dbpp;
		/* make sure db_sglist is large enough */
		if (dbuf->db_sglist_length != numbufs) {
			cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
			    dbuf->db_sglist_length, numbufs);
		}

		sgl = &dbuf->db_sglist[0];
		for (i = 0; i < numbufs; i++) {
			dbp = dbpp[i];
			/* only the first block may start mid-buffer */
			odiff =  offset - dbp->db_offset;
			ASSERT(odiff == 0 || i == 0);
			sgl->seg_addr = (uint8_t *)dbp->db_data + odiff;
			seglen = MIN(len, dbp->db_size - odiff);
			sgl->seg_length = (uint32_t)seglen;
			offset += seglen;
			len -= seglen;
			sgl++;
		}
		ASSERT(len == 0);

	}
	return (error);
}
213 
214 /*
215  * Release a dmu_buf_t array.
216  */
217 /*ARGSUSED*/
218 void
219 sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
220 {
221 	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
222 
223 	ASSERT(zvio->zvio_dbp);
224 	ASSERT(dbuf->db_sglist_length);
225 
226 	dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG);
227 }
228 
/*
 * Allocate enough loaned arc buffers for the requested region.
 * Mimic the handling of the dmu_buf_t array used for reads as closely
 * as possible even though the arc_buf_t's are anonymous until released.
 * The buffers will match the zvol object blocks sizes and alignments
 * such that a data copy may be avoided when the buffers are assigned.
 *
 * Returns 0 on success (zvio_abp then holds the arc_buf_t pointer
 * array), E2BIG for an oversized request, or EIO for a request that
 * extends past the end of the volume.
 */
int
sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
	int		blkshift, numbufs, i;
	uint64_t	blksize;
	arc_buf_t	**abp;
	stmf_sglist_ent_t *sgl;
	uint64_t 	len = dbuf->db_data_size;
	uint64_t 	offset = zvio->zvio_offset;

	/* Make sure request is reasonable */
	if (len > sl->sl_max_xfer_len)
		return (E2BIG);
	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
		return (EIO);

	/*
	 * Break up the request into chunks to match
	 * the volume block size. Only full, and aligned
	 * buffers will avoid the data copy in the dmu.
	 */
	/*
	 * calculate how may dbufs are needed
	 */
	blksize = sl->sl_blksize;
	ASSERT(ISP2(blksize));
	blkshift = highbit(blksize - 1);
	/*
	 * taken from dmu_buf_hold_array_by_dnode()
	 */
	numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
	    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	/* caller must size db_sglist to exactly one entry per block */
	if (dbuf->db_sglist_length != numbufs) {
		cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
		    dbuf->db_sglist_length, numbufs);
	}
	/*
	 * allocate a holder for the needed arc_buf pointers
	 */
	abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP);
	/*
	 * The write operation uses loaned arc buffers so that
	 * the xfer_data is done outside of a dmu transaction.
	 * These buffers will exactly match the request unlike
	 * the dmu buffers obtained from the read operation.
	 */
	/*
	 * allocate the arc buffers and fill in the stmf sglist
	 */
	sgl = &dbuf->db_sglist[0];
	for (i = 0; i < numbufs; i++) {
		uint64_t seglen;

		/* first block may not be aligned */
		seglen = P2NPHASE(offset, blksize);
		if (seglen == 0)
			seglen = blksize;
		/* last block may be partial */
		seglen = MIN(seglen, len);
		/*
		 * NOTE(review): dmu_request_arcbuf() is presumed to block
		 * rather than fail; no NULL check is made here — confirm.
		 */
		abp[i] = dmu_request_arcbuf(sl->sl_zvol_bonus_hdl, (int)seglen);
		ASSERT(arc_buf_size(abp[i]) == (int)seglen);
		sgl->seg_addr = abp[i]->b_data;
		sgl->seg_length = (uint32_t)seglen;
		sgl++;
		offset += seglen;
		len -= seglen;
	}
	ASSERT(len == 0);

	zvio->zvio_abp = abp;
	return (0);
}
308 
309 /*ARGSUSED*/
310 void
311 sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
312 {
313 	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
314 	int i;
315 	arc_buf_t **abp = zvio->zvio_abp;
316 
317 	/* free arcbufs */
318 	for (i = 0; i < dbuf->db_sglist_length; i++)
319 		dmu_return_arcbuf(*abp++);
320 	kmem_free(zvio->zvio_abp,
321 	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
322 	zvio->zvio_abp = NULL;
323 }
324 
/*
 * Release the arc_buf_t array allocated above and handle these cases :
 *
 * flags == 0 - create transaction and assign all arc bufs to offsets
 * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices
 *
 * Returns 0 on success or the dmu_tx_assign() error; on error the
 * loaned buffers are returned via the abort path and nothing is
 * written.
 */
int
sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
	dmu_tx_t	*tx;
	int		sync, i, error;
	rl_t 		*rl;
	arc_buf_t	**abp = zvio->zvio_abp;
	int		flags = zvio->zvio_flags;
	uint64_t	toffset, offset = zvio->zvio_offset;
	uint64_t	resid, len = dbuf->db_data_size;

	/*
	 * NOTE(review): ZVIO_ABORT passes this ASSERT yet takes the normal
	 * commit path below; abort callers appear to be expected to use
	 * sbd_zvol_rele_write_bufs_abort() instead — confirm.
	 */
	ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT);

	/* lock out other accessors of the span being written */
	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);

	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len);
	error = dmu_tx_assign(tx, TXG_WAIT);

	if (error) {
		/* unwind: abort tx, drop lock, return the loaned buffers */
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		sbd_zvol_rele_write_bufs_abort(sl, dbuf);
		return (error);
	}

	/* assign each loaned arc buffer to its offset within the object */
	toffset = offset;
	resid = len;
	for (i = 0; i < dbuf->db_sglist_length; i++) {
		arc_buf_t *abuf;
		int size;

		abuf = abp[i];
		size = arc_buf_size(abuf);
		dmu_assign_arcbuf(sl->sl_zvol_bonus_hdl, toffset, abuf, tx);
		toffset += size;
		resid -= size;
	}
	ASSERT(resid == 0);

	/* log the write; sync (to the zil) when the write cache is off */
	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
	zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
	    (ssize_t)len, sync);
	dmu_tx_commit(tx);
	zfs_range_unlock(rl);
	kmem_free(zvio->zvio_abp,
	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
	zvio->zvio_abp = NULL;
	if (sync && (flags & ZVIO_COMMIT))
		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
	return (0);
}
384 
385 /*
386  * Copy interface for callers using direct zvol access.
387  * Very similar to zvol_read but the uio may have multiple iovec entries.
388  */
389 int
390 sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
391 {
392 	int		error;
393 	rl_t 		*rl;
394 	uint64_t	len = (uint64_t)uio->uio_resid;
395 	uint64_t	offset = (uint64_t)uio->uio_loffset;
396 
397 	/* Make sure request is reasonable */
398 	if (len > sl->sl_max_xfer_len)
399 		return (E2BIG);
400 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
401 		return (EIO);
402 
403 	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
404 
405 	error =  dmu_read_uio(sl->sl_zvol_objset_hdl, ZVOL_OBJ, uio, len);
406 
407 	zfs_range_unlock(rl);
408 	if (error == ECKSUM)
409 		error = EIO;
410 	return (error);
411 }
412 
/*
 * Copy interface for callers using direct zvol access.
 * Very similar to zvol_write but the uio may have multiple iovec entries.
 *
 * flags == ZVIO_COMMIT additionally commits to the zil on sync (WCE
 * off) devices.  Returns 0 on success, E2BIG/EIO for unreasonable
 * requests, or the dmu_tx_assign()/dmu_write_uio() error.
 */
int
sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
{
	rl_t 		*rl;
	dmu_tx_t 	*tx;
	int		error, sync;
	uint64_t	len = (uint64_t)uio->uio_resid;
	uint64_t	offset = (uint64_t)uio->uio_loffset;

	ASSERT(flags == 0 || flags == ZVIO_COMMIT);

	/* Make sure request is reasonable */
	if (len > sl->sl_max_xfer_len)
		return (E2BIG);
	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
		return (EIO);

	/* lock out other accessors of the span being written */
	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);

	/* sync (to the zil) when the write cache is disabled */
	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);

	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		/*
		 * XXX use the new bonus handle entry.
		 */
		error = dmu_write_uio(sl->sl_zvol_objset_hdl, ZVOL_OBJ,
		    uio, len, tx);
		if (error == 0) {
			/* only log writes that actually succeeded */
			zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
			    (ssize_t)len, sync);
		}
		dmu_tx_commit(tx);
	}
	zfs_range_unlock(rl);
	if (sync && (flags & ZVIO_COMMIT))
		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
	if (error == ECKSUM)
		error = EIO;
	return (error);
}
462