xref: /illumos-gate/usr/src/uts/common/fs/zfs/rrwlock.c (revision bde2df36223c26750e6e5e801907d885e088ee30)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2012 by Delphix. All rights reserved.
27  */
28 
29 #include <sys/refcount.h>
30 #include <sys/rrwlock.h>
31 
32 /*
33  * This file contains the implementation of a re-entrant read
34  * reader/writer lock (aka "rrwlock").
35  *
36  * This is a normal reader/writer lock with the additional feature
37  * of allowing threads who have already obtained a read lock to
38  * re-enter another read lock (re-entrant read) - even if there are
39  * waiting writers.
40  *
41  * Callers who have not obtained a read lock give waiting writers priority.
42  *
43  * The rrwlock_t lock does not allow re-entrant writers, nor does it
44  * allow a re-entrant mix of reads and writes (that is, it does not
45  * allow a caller who has already obtained a read lock to be able to
46  * then grab a write lock without first dropping all read locks, and
47  * vice versa).
48  *
49  * The rrwlock_t uses tsd (thread specific data) to keep a list of
50  * nodes (rrw_node_t), where each node keeps track of which specific
51  * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
52  * should be rare, a thread that grabs multiple reads on the same rrwlock_t
53  * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the
54  * tsd list can represent a different rrwlock_t.  This allows a thread
55  * to enter multiple and unique rrwlock_ts for read locks at the same time.
56  *
57  * Since using tsd exposes some overhead, the rrwlock_t only needs to
58  * keep tsd data when writers are waiting.  If no writers are waiting, then
59  * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
60  * is needed.  Once a writer attempts to grab the lock, readers then
61  * keep tsd data and bump the linked readers count (rr_linked_rcount).
62  *
63  * If there are waiting writers and there are anonymous readers, then a
64  * reader doesn't know if it is a re-entrant lock. But since it may be one,
65  * we allow the read to proceed (otherwise it could deadlock).  Since once
66  * waiting writers are active, readers no longer bump the anonymous count,
67  * the anonymous readers will eventually flush themselves out.  At this point,
68  * readers will be able to tell if they are a re-entrant lock (have a
69  * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
 * we must let them proceed.  If they are not, then the reader blocks for the
71  * waiting writers.  Hence, we do not starve writers.
72  */
73 
74 /* global key for TSD */
75 uint_t rrw_tsd_key;
76 
/*
 * Per-thread TSD list node: records one outstanding linked read hold on
 * the rrwlock_t that 'rn_rrl' points to.  A thread's nodes form a singly
 * linked list headed by its rrw_tsd_key TSD slot; one node is added per
 * linked read enter, so re-entrant enters produce duplicate entries.
 */
typedef struct rrw_node {
	struct rrw_node	*rn_next;
	rrwlock_t	*rn_rrl;
} rrw_node_t;
81 
82 static rrw_node_t *
83 rrn_find(rrwlock_t *rrl)
84 {
85 	rrw_node_t *rn;
86 
87 	if (refcount_count(&rrl->rr_linked_rcount) == 0)
88 		return (NULL);
89 
90 	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
91 		if (rn->rn_rrl == rrl)
92 			return (rn);
93 	}
94 	return (NULL);
95 }
96 
97 /*
98  * Add a node to the head of the singly linked list.
99  */
100 static void
101 rrn_add(rrwlock_t *rrl)
102 {
103 	rrw_node_t *rn;
104 
105 	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
106 	rn->rn_rrl = rrl;
107 	rn->rn_next = tsd_get(rrw_tsd_key);
108 	VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
109 }
110 
111 /*
112  * If a node is found for 'rrl', then remove the node from this
113  * thread's list and return TRUE; otherwise return FALSE.
114  */
115 static boolean_t
116 rrn_find_and_remove(rrwlock_t *rrl)
117 {
118 	rrw_node_t *rn;
119 	rrw_node_t *prev = NULL;
120 
121 	if (refcount_count(&rrl->rr_linked_rcount) == 0)
122 		return (B_FALSE);
123 
124 	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
125 		if (rn->rn_rrl == rrl) {
126 			if (prev)
127 				prev->rn_next = rn->rn_next;
128 			else
129 				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
130 			kmem_free(rn, sizeof (*rn));
131 			return (B_TRUE);
132 		}
133 		prev = rn;
134 	}
135 	return (B_FALSE);
136 }
137 
138 void
139 rrw_init(rrwlock_t *rrl)
140 {
141 	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
142 	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
143 	rrl->rr_writer = NULL;
144 	refcount_create(&rrl->rr_anon_rcount);
145 	refcount_create(&rrl->rr_linked_rcount);
146 	rrl->rr_writer_wanted = B_FALSE;
147 }
148 
149 void
150 rrw_destroy(rrwlock_t *rrl)
151 {
152 	mutex_destroy(&rrl->rr_lock);
153 	cv_destroy(&rrl->rr_cv);
154 	ASSERT(rrl->rr_writer == NULL);
155 	refcount_destroy(&rrl->rr_anon_rcount);
156 	refcount_destroy(&rrl->rr_linked_rcount);
157 }
158 
/*
 * Grab 'rrl' for reading, possibly re-entrantly.  'tag' is passed through
 * to the refcount machinery for hold tracking.
 */
static void
rrw_enter_read(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	/*
	 * Fast path (non-DEBUG kernel only): with no writer holding or
	 * wanting the lock, bump the anonymous count directly, skipping
	 * the refcount bookkeeping and TSD tracking below.
	 */
	if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
		rrl->rr_anon_rcount.rc_count++;
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
#endif
	ASSERT(rrl->rr_writer != curthread);
	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);

	/*
	 * Block while a writer holds the lock, or while a writer is
	 * waiting and we are neither possibly-re-entrant (anonymous
	 * readers still draining) nor provably re-entrant (we have a
	 * rrw_node_t for this lock).  See the block comment at the top
	 * of the file for why those readers must be allowed through.
	 */
	while (rrl->rr_writer || (rrl->rr_writer_wanted &&
	    refcount_is_zero(&rrl->rr_anon_rcount) &&
	    rrn_find(rrl) == NULL))
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);

	if (rrl->rr_writer_wanted) {
		/* may or may not be a re-entrant enter */
		rrn_add(rrl);
		(void) refcount_add(&rrl->rr_linked_rcount, tag);
	} else {
		(void) refcount_add(&rrl->rr_anon_rcount, tag);
	}
	ASSERT(rrl->rr_writer == NULL);
	mutex_exit(&rrl->rr_lock);
}
189 
190 static void
191 rrw_enter_write(rrwlock_t *rrl)
192 {
193 	mutex_enter(&rrl->rr_lock);
194 	ASSERT(rrl->rr_writer != curthread);
195 
196 	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
197 	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
198 	    rrl->rr_writer != NULL) {
199 		rrl->rr_writer_wanted = B_TRUE;
200 		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
201 	}
202 	rrl->rr_writer_wanted = B_FALSE;
203 	rrl->rr_writer = curthread;
204 	mutex_exit(&rrl->rr_lock);
205 }
206 
207 void
208 rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
209 {
210 	if (rw == RW_READER)
211 		rrw_enter_read(rrl, tag);
212 	else
213 		rrw_enter_write(rrl);
214 }
215 
/*
 * Release one hold on 'rrl' — read or write is inferred from the lock
 * state.  'tag' must match the tag used at enter time for read holds.
 */
void
rrw_exit(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	/*
	 * Fast path (non-DEBUG kernel only): with no writer and no linked
	 * readers, this must be an anonymous read hold; drop the count
	 * directly, waking waiters when it reaches zero.
	 */
	if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
		rrl->rr_anon_rcount.rc_count--;
		if (rrl->rr_anon_rcount.rc_count == 0)
			cv_broadcast(&rrl->rr_cv);
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
#endif
	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
	    rrl->rr_writer != NULL);

	if (rrl->rr_writer == NULL) {
		int64_t count;
		/*
		 * A hold tracked in this thread's TSD list is a linked
		 * read; otherwise it must be an anonymous read.
		 */
		if (rrn_find_and_remove(rrl))
			count = refcount_remove(&rrl->rr_linked_rcount, tag);
		else
			count = refcount_remove(&rrl->rr_anon_rcount, tag);
		/* Last reader of this kind: wake waiting writers/readers. */
		if (count == 0)
			cv_broadcast(&rrl->rr_cv);
	} else {
		ASSERT(rrl->rr_writer == curthread);
		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
		    refcount_is_zero(&rrl->rr_linked_rcount));
		rrl->rr_writer = NULL;
		cv_broadcast(&rrl->rr_cv);
	}
	mutex_exit(&rrl->rr_lock);
}
251 
252 boolean_t
253 rrw_held(rrwlock_t *rrl, krw_t rw)
254 {
255 	boolean_t held;
256 
257 	mutex_enter(&rrl->rr_lock);
258 	if (rw == RW_WRITER) {
259 		held = (rrl->rr_writer == curthread);
260 	} else {
261 		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
262 		    !refcount_is_zero(&rrl->rr_linked_rcount));
263 	}
264 	mutex_exit(&rrl->rr_lock);
265 
266 	return (held);
267 }
268 
269 void
270 rrw_tsd_destroy(void *arg)
271 {
272 	rrw_node_t *rn = arg;
273 	if (rn != NULL) {
274 		panic("thread %p terminating with rrw lock %p held",
275 		    (void *)curthread, (void *)rn->rn_rrl);
276 	}
277 }
278