commit 4bbb86db6d6da5c05c452a810f20ffb19d6918de Author: Richard Yao Date: Sun Jun 24 20:11:40 2012 -0400 Make callers responsible for memory allocation in zfs_range_lock() zfs_range_lock() is used in zvols, and previously, it could deadlock due to an allocation using KM_SLEEP. We avoid this by moving responsibility the memory allocation from zfs_range_lock() to the caller. This enables us to avoid such deadlocks and use stack allocations, which are more efficient and prevents deadlocks. The contexts in which stack allocations are done do not appear to be stack heavy, so we do not risk overflowing the stack from doing this. Signed-off-by: Richard Yao Conflicts: module/zfs/zvol.c diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 72d511b..c5dd0c2 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -973,12 +973,11 @@ ztest_object_unlock(ztest_ds_t *zd, uint64_t object) } static rl_t * -ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, +ztest_range_lock(rl_t *rl, ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, rl_type_t type) { uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; - rl_t *rl; rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); rl->rl_object = object; @@ -1389,7 +1388,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) dmu_tx_t *tx; dmu_buf_t *db; arc_buf_t *abuf = NULL; - rl_t *rl; + rl_t rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -1413,7 +1412,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) bt = NULL; ztest_object_lock(zd, lr->lr_foid, RL_READER); - rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + ztest_range_lock(&rl, zd, lr->lr_foid, offset, length, RL_WRITER); VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); @@ -1438,7 +1437,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) if (abuf != NULL) dmu_return_arcbuf(abuf); dmu_buf_rele(db, FTAG); - ztest_range_unlock(rl); + ztest_range_unlock(&rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } @@ -1495,7 +1494,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) dmu_tx_commit(tx); - ztest_range_unlock(rl); + ztest_range_unlock(&rl); ztest_object_unlock(zd, lr->lr_foid); return (0); @@ -1507,13 +1506,13 @@ ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap) objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; - rl_t *rl; + rl_t rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, RL_READER); - rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, + ztest_range_lock(&rl, zd, lr->lr_foid, lr->lr_offset, lr->lr_length, RL_WRITER); tx = dmu_tx_create(os); @@ -1522,7 +1521,7 @@ ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap) txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { - ztest_range_unlock(rl); + ztest_range_unlock(&rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } @@ -1534,7 +1533,7 @@ ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap) dmu_tx_commit(tx); - ztest_range_unlock(rl); + ztest_range_unlock(&rl); ztest_object_unlock(zd, lr->lr_foid); return (0); @@ -1670,6 +1669,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) dmu_object_info_t doi; dmu_buf_t *db; zgd_t *zgd; + rl_t rl; + int error; ztest_object_lock(zd, object, RL_READER); @@ -1694,9 +1695,10 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); zgd->zgd_zilog = zd->zd_zilog; zgd->zgd_private = zd; + zgd->zgd_rl = &rl; if (buf != NULL) { /* immediate write */ - zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, + ztest_range_lock(zgd->zgd_rl, zd, object, offset, size, RL_READER); error = dmu_read(os, object, offset, size, buf, @@ -1711,7 +1713,7 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) offset = 0; } - zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, + ztest_range_lock(zgd->zgd_rl, zd, object, offset, size, RL_READER); error = dmu_buf_hold(os, object, offset, zgd, &db, @@ -1953,12 +1955,12 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; - rl_t *rl; + rl_t rl; txg_wait_synced(dmu_objset_pool(os), 0); ztest_object_lock(zd, object, RL_READER); - rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + ztest_range_lock(&rl, zd, object, offset, size, RL_WRITER); tx = dmu_tx_create(os); @@ -1974,7 +1976,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) (void) dmu_free_long_range(os, object, offset, size); } - ztest_range_unlock(rl); + ztest_range_unlock(&rl); ztest_object_unlock(zd, object); } diff --git a/include/sys/zfs_rlock.h b/include/sys/zfs_rlock.h index da18b1f..85dc16a 100644 --- a/include/sys/zfs_rlock.h +++ b/include/sys/zfs_rlock.h @@ -63,7 +63,7 @@ typedef struct rl { * is converted to WRITER that specified to lock from the start of the * end of file. zfs_range_lock() returns the range lock structure. */ -rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); +rl_t *zfs_range_lock(rl_t *rl, znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); /* * Unlock range and destroy range lock structure. diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c index f3ada17..6e9afc0 100644 --- a/module/zfs/zfs_rlock.c +++ b/module/zfs/zfs_rlock.c @@ -31,9 +31,9 @@ * Interface * --------- * Defined in zfs_rlock.h but essentially: - * rl = zfs_range_lock(zp, off, len, lock_type); - * zfs_range_unlock(rl); - * zfs_range_reduce(rl, off, len); + * zfs_range_lock(&rl, zp, off, len, lock_type); + * zfs_range_unlock(&rl); + * zfs_range_reduce(&rl, off, len); * * AVL tree * -------- @@ -420,13 +420,11 @@ got_lock: * previously locked as RL_WRITER). */ rl_t * -zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) +zfs_range_lock(rl_t *new, znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) { - rl_t *new; ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); - new = kmem_alloc(sizeof (rl_t), KM_SLEEP); new->r_zp = zp; new->r_off = off; if (len + off < off) /* overflow */ diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 2da5fec..039269a 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -370,7 +370,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) objset_t *os; ssize_t n, nbytes; int error = 0; - rl_t *rl; + rl_t rl; #ifdef HAVE_UIO_ZEROCOPY xuio_t *xuio = NULL; #endif /* HAVE_UIO_ZEROCOPY */ @@ -418,7 +418,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) /* * Lock the range against changes. */ - rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); + zfs_range_lock(&rl, zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip @@ -482,7 +482,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) n -= nbytes; } out: - zfs_range_unlock(rl); + zfs_range_unlock(&rl); ZFS_ACCESSTIME_STAMP(zsb, zp); zfs_inode_update(zp); @@ -524,7 +524,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) zilog_t *zilog; offset_t woff; ssize_t n, nbytes; - rl_t *rl; + rl_t rl; int max_blksz = zsb->z_max_blksz; int error = 0; arc_buf_t *abuf; @@ -608,9 +608,9 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ - rl = zfs_range_lock(zp, 0, n, RL_APPEND); - woff = rl->r_off; - if (rl->r_len == UINT64_MAX) { + zfs_range_lock(&rl, zp, 0, n, RL_APPEND); + woff = rl.r_off; + if (rl.r_len == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. @@ -625,11 +625,11 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ - rl = zfs_range_lock(zp, woff, n, RL_WRITER); + zfs_range_lock(&rl, zp, woff, n, RL_WRITER); } if (woff >= limit) { - zfs_range_unlock(rl); + zfs_range_unlock(&rl); ZFS_EXIT(zsb); return (EFBIG); } @@ -719,7 +719,7 @@ again: * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ - if (rl->r_len == UINT64_MAX) { + if (rl.r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { @@ -729,7 +729,7 @@ again: new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); - zfs_range_reduce(rl, woff, n); + zfs_range_reduce(&rl, woff, n); } /* @@ -842,7 +842,7 @@ again: uio_prefaultpages(MIN(n, max_blksz), uio); } - zfs_range_unlock(rl); + zfs_range_unlock(&rl); /* * If we're in replay mode, or we made no progress, return error. @@ -946,7 +946,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); + zfs_range_lock(zgd->zgd_rl, zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = ENOENT; @@ -967,7 +967,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; - zgd->zgd_rl = zfs_range_lock(zp, offset, size, + zfs_range_lock(zgd->zgd_rl, zp, offset, size, RL_READER); if (zp->z_blksz == size) break; diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 3a6872f..e363839 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -1158,20 +1158,20 @@ zfs_extend(znode_t *zp, uint64_t end) { zfs_sb_t *zsb = ZTOZSB(zp); dmu_tx_t *tx; - rl_t *rl; + rl_t rl; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + zfs_range_lock(&rl, zp, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { - zfs_range_unlock(rl); + zfs_range_unlock(&rl); return (0); } top: @@ -1202,7 +1202,7 @@ top: goto top; } dmu_tx_abort(tx); - zfs_range_unlock(rl); + zfs_range_unlock(&rl); return (error); } @@ -1214,7 +1214,7 @@ top: VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), &zp->z_size, sizeof (zp->z_size), tx)); - zfs_range_unlock(rl); + zfs_range_unlock(&rl); dmu_tx_commit(tx); @@ -1235,19 +1235,19 @@ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfs_sb_t *zsb = ZTOZSB(zp); - rl_t *rl; + rl_t rl; int error; /* * Lock the range being freed. */ - rl = zfs_range_lock(zp, off, len, RL_WRITER); + zfs_range_lock(&rl, zp, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { - zfs_range_unlock(rl); + zfs_range_unlock(&rl); return (0); } @@ -1256,7 +1256,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) error = dmu_free_long_range(zsb->z_os, zp->z_id, off, len); - zfs_range_unlock(rl); + zfs_range_unlock(&rl); return (error); } @@ -1275,7 +1275,7 @@ zfs_trunc(znode_t *zp, uint64_t end) { zfs_sb_t *zsb = ZTOZSB(zp); dmu_tx_t *tx; - rl_t *rl; + rl_t rl; int error; sa_bulk_attr_t bulk[2]; int count = 0; @@ -1283,19 +1283,19 @@ zfs_trunc(znode_t *zp, uint64_t end) /* * We will change zp_size, lock the whole file. */ - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + zfs_range_lock(&rl, zp, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { - zfs_range_unlock(rl); + zfs_range_unlock(&rl); return (0); } error = dmu_free_long_range(zsb->z_os, zp->z_id, end, -1); if (error) { - zfs_range_unlock(rl); + zfs_range_unlock(&rl); return (error); } top: @@ -1310,7 +1310,7 @@ top: goto top; } dmu_tx_abort(tx); - zfs_range_unlock(rl); + zfs_range_unlock(&rl); return (error); } @@ -1327,7 +1327,7 @@ top: dmu_tx_commit(tx); - zfs_range_unlock(rl); + zfs_range_unlock(&rl); return (0); } diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 125d58d..5cae597 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -537,7 +537,7 @@ zvol_write(void *arg) uint64_t size = blk_rq_bytes(req); int error = 0; dmu_tx_t *tx; - rl_t *rl; + rl_t rl; if (req->cmd_flags & VDEV_REQ_FLUSH) zil_commit(zv->zv_zilog, ZVOL_OBJ); @@ -550,7 +550,7 @@ zvol_write(void *arg) return; } - rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER); + zfs_range_lock(&rl, &zv->zv_znode, offset, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size); @@ -559,7 +559,7 @@ zvol_write(void *arg) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - zfs_range_unlock(rl); + zfs_range_unlock(&rl); blk_end_request(req, -error, size); return; } @@ -570,7 +570,7 @@ zvol_write(void *arg) req->cmd_flags & VDEV_REQ_FUA); dmu_tx_commit(tx); - zfs_range_unlock(rl); + zfs_range_unlock(&rl); if ((req->cmd_flags & VDEV_REQ_FUA) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) @@ -589,7 +589,7 @@ zvol_discard(void *arg) uint64_t offset = blk_rq_pos(req) << 9; uint64_t size = blk_rq_bytes(req); int error; - rl_t *rl; + rl_t rl; if (offset + size > zv->zv_volsize) { blk_end_request(req, -EIO, size); @@ -601,7 +601,7 @@ zvol_discard(void *arg) return; } - rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER); + zfs_range_lock(&rl, &zv->zv_znode, offset, size, RL_WRITER); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, size); @@ -609,7 +609,7 @@ zvol_discard(void *arg) * TODO: maybe we should add the operation to the log. */ - zfs_range_unlock(rl); + zfs_range_unlock(&rl); blk_end_request(req, -error, size); } @@ -630,18 +630,18 @@ zvol_read(void *arg) uint64_t offset = blk_rq_pos(req) << 9; uint64_t size = blk_rq_bytes(req); int error; - rl_t *rl; + rl_t rl; if (size == 0) { blk_end_request(req, 0, size); return; } - rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); + zfs_range_lock(&rl, &zv->zv_znode, offset, size, RL_READER); error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req); - zfs_range_unlock(rl); + zfs_range_unlock(&rl); /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -744,6 +744,7 @@ zvol_get_done(zgd_t *zgd, int error) if (error == 0 && zgd->zgd_bp) zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); + kmem_free(zgd->zgd_rl, sizeof(rl_t)); kmem_free(zgd, sizeof (zgd_t)); } @@ -766,7 +767,8 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zv->zv_zilog; - zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); + zgd->zgd_rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); + zfs_range_lock(zgd->zgd_rl, &zv->zv_znode, offset, size, RL_READER); /* * Write records come in two flavors: immediate and indirect.