Contents of /sys-fs/zfs/files/zfs-0.6.0_rc9-range-lock-caller-allocate.patch

Revision 1.3
Mon Jun 25 20:21:55 2012 UTC by ryao
Branch: MAIN
Changes since 1.2: +113 -67 lines
Fix regression caused by patch to fix deadlock

(Portage version: 2.1.10.49/cvs/Linux x86_64)

From fc1f1d3940f4d2e5b1b85481d900d8198cf4b6f3 Mon Sep 17 00:00:00 2001
From: Richard Yao <ryao@cs.stonybrook.edu>
Date: Mon, 25 Jun 2012 14:41:30 -0400
Subject: [PATCH] Make callers responsible for memory allocation in
 zfs_range_lock()

zfs_range_lock() is used in zvols, and previously, it could deadlock due
to an allocation using KM_SLEEP. We avoid this by moving responsibility
for the memory allocation from zfs_range_lock() to the caller. This
enables callers to use stack allocations, which are more efficient and
cannot trigger such deadlocks. The contexts in which stack allocations
are done do not appear to be stack heavy, so we do not risk overflowing
the stack by doing this.

Signed-off-by: Richard Yao <ryao@cs.stonybrook.edu>

Conflicts:

	module/zfs/zvol.c
---
 cmd/ztest/ztest.c       | 32 +++++++++++++++++---------------
 include/sys/zfs_rlock.h |  2 +-
 module/zfs/zfs_rlock.c  | 15 +++++++--------
 module/zfs/zfs_vnops.c  | 30 ++++++++++++++++--------------
 module/zfs/zfs_znode.c  | 30 +++++++++++++++---------------
 module/zfs/zvol.c       | 24 +++++++++++++-----------
 6 files changed, 69 insertions(+), 64 deletions(-)

diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 72d511b..c5dd0c2 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -973,12 +973,11 @@ enum ztest_object {
}

static rl_t *
-ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+ztest_range_lock(rl_t *rl, ztest_ds_t *zd, uint64_t object, uint64_t offset,
uint64_t size, rl_type_t type)
{
uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
- rl_t *rl;

rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
rl->rl_object = object;
@@ -1389,7 +1388,7 @@ enum ztest_object {
dmu_tx_t *tx;
dmu_buf_t *db;
arc_buf_t *abuf = NULL;
- rl_t *rl;
+ rl_t rl;

if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -1413,7 +1412,7 @@ enum ztest_object {
bt = NULL;

ztest_object_lock(zd, lr->lr_foid, RL_READER);
- rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+ ztest_range_lock(&rl, zd, lr->lr_foid, offset, length, RL_WRITER);

VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

@@ -1438,7 +1437,7 @@ enum ztest_object {
if (abuf != NULL)
dmu_return_arcbuf(abuf);
dmu_buf_rele(db, FTAG);
- ztest_range_unlock(rl);
+ ztest_range_unlock(&rl);
ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC);
}
@@ -1495,7 +1494,7 @@ enum ztest_object {

dmu_tx_commit(tx);

- ztest_range_unlock(rl);
+ ztest_range_unlock(&rl);
ztest_object_unlock(zd, lr->lr_foid);

return (0);
@@ -1507,13 +1506,13 @@ enum ztest_object {
objset_t *os = zd->zd_os;
dmu_tx_t *tx;
uint64_t txg;
- rl_t *rl;
+ rl_t rl;

if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));

ztest_object_lock(zd, lr->lr_foid, RL_READER);
- rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ ztest_range_lock(&rl, zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
RL_WRITER);

tx = dmu_tx_create(os);
@@ -1522,7 +1521,7 @@ enum ztest_object {

txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg == 0) {
- ztest_range_unlock(rl);
+ ztest_range_unlock(&rl);
ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC);
}
@@ -1534,7 +1533,7 @@ enum ztest_object {

dmu_tx_commit(tx);

- ztest_range_unlock(rl);
+ ztest_range_unlock(&rl);
ztest_object_unlock(zd, lr->lr_foid);

return (0);
@@ -1670,6 +1669,8 @@ enum ztest_object {
dmu_object_info_t doi;
dmu_buf_t *db;
zgd_t *zgd;
+ rl_t rl;
+
int error;

ztest_object_lock(zd, object, RL_READER);
@@ -1694,9 +1695,10 @@ enum ztest_object {
zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
zgd->zgd_zilog = zd->zd_zilog;
zgd->zgd_private = zd;
+ zgd->zgd_rl = &rl;

if (buf != NULL) { /* immediate write */
- zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ ztest_range_lock(zgd->zgd_rl, zd, object, offset, size,
RL_READER);

error = dmu_read(os, object, offset, size, buf,
@@ -1711,7 +1713,7 @@ enum ztest_object {
offset = 0;
}

- zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ ztest_range_lock(zgd->zgd_rl, zd, object, offset, size,
RL_READER);

error = dmu_buf_hold(os, object, offset, zgd, &db,
@@ -1953,12 +1955,12 @@ enum ztest_object {
objset_t *os = zd->zd_os;
dmu_tx_t *tx;
uint64_t txg;
- rl_t *rl;
+ rl_t rl;

txg_wait_synced(dmu_objset_pool(os), 0);

ztest_object_lock(zd, object, RL_READER);
- rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+ ztest_range_lock(&rl, zd, object, offset, size, RL_WRITER);

tx = dmu_tx_create(os);

@@ -1974,7 +1976,7 @@ enum ztest_object {
(void) dmu_free_long_range(os, object, offset, size);
}

- ztest_range_unlock(rl);
+ ztest_range_unlock(&rl);
ztest_object_unlock(zd, object);
}

diff --git a/include/sys/zfs_rlock.h b/include/sys/zfs_rlock.h
index da18b1f..85dc16a 100644
--- a/include/sys/zfs_rlock.h
+++ b/include/sys/zfs_rlock.h
@@ -63,7 +63,7 @@
* is converted to WRITER that specified to lock from the start of the
* end of file. zfs_range_lock() returns the range lock structure.
*/
-rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
+rl_t *zfs_range_lock(rl_t *rl, znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);

/*
* Unlock range and destroy range lock structure.
diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c
index f3ada17..eb81777 100644
--- a/module/zfs/zfs_rlock.c
+++ b/module/zfs/zfs_rlock.c
@@ -31,9 +31,9 @@
* Interface
* ---------
* Defined in zfs_rlock.h but essentially:
- * rl = zfs_range_lock(zp, off, len, lock_type);
- * zfs_range_unlock(rl);
- * zfs_range_reduce(rl, off, len);
+ * zfs_range_lock(&rl, zp, off, len, lock_type);
+ * zfs_range_unlock(&rl);
+ * zfs_range_reduce(&rl, off, len);
*
* AVL tree
* --------
@@ -420,13 +420,11 @@
* previously locked as RL_WRITER).
*/
rl_t *
-zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
+zfs_range_lock(rl_t *new, znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
{
- rl_t *new;

ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

- new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
new->r_zp = zp;
new->r_off = off;
if (len + off < off) /* overflow */
@@ -531,7 +529,6 @@
}

mutex_exit(&zp->z_range_lock);
- kmem_free(remove, sizeof (rl_t));
}
}

@@ -572,7 +569,9 @@

while ((free_rl = list_head(&free_list)) != NULL) {
list_remove(&free_list, free_rl);
- zfs_range_free(free_rl);
+ /* Freeing rl is the caller's responsibility */
+ if (free_rl != rl)
+ zfs_range_free(free_rl);
}

list_destroy(&free_list);
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 2da5fec..c8ca7c5 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -370,7 +370,7 @@
objset_t *os;
ssize_t n, nbytes;
int error = 0;
- rl_t *rl;
+ rl_t rl;
#ifdef HAVE_UIO_ZEROCOPY
xuio_t *xuio = NULL;
#endif /* HAVE_UIO_ZEROCOPY */
@@ -418,7 +418,7 @@
/*
* Lock the range against changes.
*/
- rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
+ zfs_range_lock(&rl, zp, uio->uio_loffset, uio->uio_resid, RL_READER);

/*
* If we are reading past end-of-file we can skip
@@ -482,7 +482,7 @@
n -= nbytes;
}
out:
- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);

ZFS_ACCESSTIME_STAMP(zsb, zp);
zfs_inode_update(zp);
@@ -524,7 +524,7 @@
zilog_t *zilog;
offset_t woff;
ssize_t n, nbytes;
- rl_t *rl;
+ rl_t rl;
int max_blksz = zsb->z_max_blksz;
int error = 0;
arc_buf_t *abuf;
@@ -608,9 +608,9 @@
* Obtain an appending range lock to guarantee file append
* semantics. We reset the write offset once we have the lock.
*/
- rl = zfs_range_lock(zp, 0, n, RL_APPEND);
- woff = rl->r_off;
- if (rl->r_len == UINT64_MAX) {
+ zfs_range_lock(&rl, zp, 0, n, RL_APPEND);
+ woff = rl.r_off;
+ if (rl.r_len == UINT64_MAX) {
/*
* We overlocked the file because this write will cause
* the file block size to increase.
@@ -625,11 +625,11 @@
* this write, then this range lock will lock the entire file
* so that we can re-write the block safely.
*/
- rl = zfs_range_lock(zp, woff, n, RL_WRITER);
+ zfs_range_lock(&rl, zp, woff, n, RL_WRITER);
}

if (woff >= limit) {
- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);
ZFS_EXIT(zsb);
return (EFBIG);
}
@@ -719,7 +719,7 @@
* on the first iteration since zfs_range_reduce() will
* shrink down r_len to the appropriate size.
*/
- if (rl->r_len == UINT64_MAX) {
+ if (rl.r_len == UINT64_MAX) {
uint64_t new_blksz;

if (zp->z_blksz > max_blksz) {
@@ -729,7 +729,7 @@
new_blksz = MIN(end_size, max_blksz);
}
zfs_grow_blocksize(zp, new_blksz, tx);
- zfs_range_reduce(rl, woff, n);
+ zfs_range_reduce(&rl, woff, n);
}

/*
@@ -842,7 +842,7 @@
uio_prefaultpages(MIN(n, max_blksz), uio);
}

- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);

/*
* If we're in replay mode, or we made no progress, return error.
@@ -915,6 +915,7 @@
blkptr_t *bp = &lr->lr_blkptr;
dmu_buf_t *db;
zgd_t *zgd;
+ rl_t rl;
int error = 0;

ASSERT(zio != NULL);
@@ -935,6 +936,7 @@
}

zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_rl = &rl;
zgd->zgd_zilog = zsb->z_log;
zgd->zgd_private = zp;

@@ -946,7 +948,7 @@
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
+ zfs_range_lock(zgd->zgd_rl, zp, offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = ENOENT;
@@ -967,7 +969,7 @@
size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff;
- zgd->zgd_rl = zfs_range_lock(zp, offset, size,
+ zfs_range_lock(zgd->zgd_rl, zp, offset, size,
RL_READER);
if (zp->z_blksz == size)
break;
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 3a6872f..e363839 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -1158,20 +1158,20 @@
{
zfs_sb_t *zsb = ZTOZSB(zp);
dmu_tx_t *tx;
- rl_t *rl;
+ rl_t rl;
uint64_t newblksz;
int error;

/*
* We will change zp_size, lock the whole file.
*/
- rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+ zfs_range_lock(&rl, zp, 0, UINT64_MAX, RL_WRITER);

/*
* Nothing to do if file already at desired length.
*/
if (end <= zp->z_size) {
- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);
return (0);
}
top:
@@ -1202,7 +1202,7 @@
goto top;
}
dmu_tx_abort(tx);
- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);
return (error);
}

@@ -1214,7 +1214,7 @@
VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
&zp->z_size, sizeof (zp->z_size), tx));

- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);

dmu_tx_commit(tx);

@@ -1235,19 +1235,19 @@
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
zfs_sb_t *zsb = ZTOZSB(zp);
- rl_t *rl;
+ rl_t rl;
int error;

/*
* Lock the range being freed.
*/
- rl = zfs_range_lock(zp, off, len, RL_WRITER);
+ zfs_range_lock(&rl, zp, off, len, RL_WRITER);

/*
* Nothing to do if file already at desired length.
*/
if (off >= zp->z_size) {
- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);
return (0);
}

@@ -1256,7 +1256,7 @@

error = dmu_free_long_range(zsb->z_os, zp->z_id, off, len);

- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);

return (error);
}
@@ -1275,7 +1275,7 @@
{
zfs_sb_t *zsb = ZTOZSB(zp);
dmu_tx_t *tx;
- rl_t *rl;
+ rl_t rl;
int error;
sa_bulk_attr_t bulk[2];
int count = 0;
@@ -1283,19 +1283,19 @@
/*
* We will change zp_size, lock the whole file.
*/
- rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+ zfs_range_lock(&rl, zp, 0, UINT64_MAX, RL_WRITER);

/*
* Nothing to do if file already at desired length.
*/
if (end >= zp->z_size) {
- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);
return (0);
}

error = dmu_free_long_range(zsb->z_os, zp->z_id, end, -1);
if (error) {
- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);
return (error);
}
top:
@@ -1310,7 +1310,7 @@
goto top;
}
dmu_tx_abort(tx);
- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);
return (error);
}

@@ -1327,7 +1327,7 @@

dmu_tx_commit(tx);

- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);

return (0);
}
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 125d58d..bbe53d9 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -537,7 +537,7 @@
uint64_t size = blk_rq_bytes(req);
int error = 0;
dmu_tx_t *tx;
- rl_t *rl;
+ rl_t rl;

if (req->cmd_flags & VDEV_REQ_FLUSH)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
@@ -550,7 +550,7 @@
return;
}

- rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
+ zfs_range_lock(&rl, &zv->zv_znode, offset, size, RL_WRITER);

tx = dmu_tx_create(zv->zv_objset);
dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size);
@@ -559,7 +559,7 @@
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);
blk_end_request(req, -error, size);
return;
}
@@ -570,7 +570,7 @@
req->cmd_flags & VDEV_REQ_FUA);

dmu_tx_commit(tx);
- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);

if ((req->cmd_flags & VDEV_REQ_FUA) ||
zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
@@ -589,7 +589,7 @@
uint64_t offset = blk_rq_pos(req) << 9;
uint64_t size = blk_rq_bytes(req);
int error;
- rl_t *rl;
+ rl_t rl;

if (offset + size > zv->zv_volsize) {
blk_end_request(req, -EIO, size);
@@ -601,7 +601,7 @@
return;
}

- rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
+ zfs_range_lock(&rl, &zv->zv_znode, offset, size, RL_WRITER);

error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, size);

@@ -609,7 +609,7 @@
* TODO: maybe we should add the operation to the log.
*/

- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);

blk_end_request(req, -error, size);
}
@@ -630,18 +630,18 @@
uint64_t offset = blk_rq_pos(req) << 9;
uint64_t size = blk_rq_bytes(req);
int error;
- rl_t *rl;
+ rl_t rl;

if (size == 0) {
blk_end_request(req, 0, size);
return;
}

- rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
+ zfs_range_lock(&rl, &zv->zv_znode, offset, size, RL_READER);

error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req);

- zfs_range_unlock(rl);
+ zfs_range_unlock(&rl);

/* convert checksum errors into IO errors */
if (error == ECKSUM)
@@ -744,6 +744,7 @@
if (error == 0 && zgd->zgd_bp)
zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

+ kmem_free(zgd->zgd_rl, sizeof (rl_t));
kmem_free(zgd, sizeof (zgd_t));
}

@@ -766,7 +767,8 @@

zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_zilog = zv->zv_zilog;
- zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
+ zgd->zgd_rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+ zfs_range_lock(zgd->zgd_rl, &zv->zv_znode, offset, size, RL_READER);

/*
* Write records come in two flavors: immediate and indirect.
--
1.7.10
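
The hunks above change every zfs_range_lock() caller to pass in its own rl_t, usually on the stack, instead of receiving a KM_SLEEP allocation from the lock routine. The following standalone C program is a minimal sketch of that ownership pattern only; the toy_rl_t type, the toy_range_lock()/toy_range_unlock() functions, and the single pthread mutex standing in for the per-znode range tree are illustrative stand-ins, not the real ZFS zfs_rlock implementation.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

typedef struct toy_rl {
        uint64_t r_off;   /* start of the locked range */
        uint64_t r_len;   /* length of the locked range */
        int      r_type;  /* reader/writer, reduced to an int here */
} toy_rl_t;

/* A single mutex stands in for the per-znode AVL tree of locked ranges. */
static pthread_mutex_t toy_tree_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Old style (before the patch): the lock routine allocates the rl_t itself
 * with a sleeping allocation and returns it, so the caller never owns the
 * memory:
 *
 *     rl = toy_range_lock_alloc(off, len, type);  // may sleep to allocate
 *     ...
 *     toy_range_unlock_free(rl);                  // frees rl internally
 *
 * New style (this patch): the caller supplies the structure.
 */
static void
toy_range_lock(toy_rl_t *rl, uint64_t off, uint64_t len, int type)
{
        pthread_mutex_lock(&toy_tree_lock);
        rl->r_off = off;
        rl->r_len = len;
        rl->r_type = type;
        /* A real implementation would insert rl into the range tree here. */
}

static void
toy_range_unlock(toy_rl_t *rl)
{
        /* A real implementation would remove rl from the range tree here. */
        (void) rl;
        pthread_mutex_unlock(&toy_tree_lock);
}

int
main(void)
{
        toy_rl_t rl;    /* caller-owned storage on the stack, no allocation */

        toy_range_lock(&rl, 0, 4096, 1 /* writer */);
        printf("locked range [%llu, %llu)\n",
            (unsigned long long)rl.r_off,
            (unsigned long long)(rl.r_off + rl.r_len));
        toy_range_unlock(&rl);
        return (0);
}

Where the lock must outlive the function that takes it, as in the zvol get_data path in the final hunks, the patch instead heap-allocates the rl_t with kmem_alloc() and frees it in the matching done callback.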
