From 59f2bee0477456519d410ab8c9203bd3f4a95474 Mon Sep 17 00:00:00 2001 From: Fabian Keil Date: Tue, 27 Oct 2015 23:09:03 +0100 Subject: [PATCH] ZFS: Optionally let spa_sync() wait until at least one vdev is writable ... before calling dsl_pool_sync(). Significantly reduces the chances that a temporary loss of all vdevs results in a spa_namespace_lock deadlock that causes the pool to become inaccessible until the next reboot if the failmode is "wait" (default). The deadlock is caused by spa_sync() holding the required locks to fault or clear the pool while waiting for the sync operation to complete, thus preventing other parts of the system from getting the pool into a usable state again. As a side effect of the deadlock, various zfs and zpool commands can lock up as well. Obtained from: ElectroBSD Deadlock example: 6 102135 zfskern txg_thread_enter mi_switch+0xde sleepq_wait+0x3a _cv_wait+0x1a6 zio_wait+0x89 dsl_pool_sync+0x12e spa_sync+0x51f txg_sync_thread+0x408 fork_exit+0x9a fork_trampoline+0xe 5985 101966 zpool - mi_switch+0xde sleepq_wait+0x3a _sx_xlock_hard+0x540 _sx_xlock+0x5d spa_get_errlog_size+0x3a spa_get_stats+0x116 zfs_ioc_pool_stats+0x4c zfsdev_ioctl+0xc68 devfs_ioctl_f+0x13b kern_ioctl+0x401 sys_ioctl+0x153 amd64_syscall+0x3e7 Xfast_syscall+0xfb 6863 101673 zpool - mi_switch+0xde sleepq_wait+0x3a _cv_wait+0x1a6 zio_wait+0x89 dbuf_read+0x464 dmu_buf_hold+0x9a zap_get_leaf_byblk+0x76 zap_deref_leaf+0xc6 fzap_cursor_retrieve+0x169 zap_cursor_retrieve+0x216 process_error_log+0xb1 spa_get_errlog+0xae zfs_ioc_error_log+0x70 zfsdev_ioctl+0xc68 devfs_ioctl_f+0x13b kern_ioctl+0x401 sys_ioctl+0x153 amd64_syscall+0x3e7 6885 100784 zpool - mi_switch+0xde sleepq_wait+0x3a _sx_xlock_hard+0x540 _sx_xlock+0x5d spa_get_errlog_size+0x3a spa_get_stats+0x116 zfs_ioc_pool_stats+0x4c zfsdev_ioctl+0xc68 devfs_ioctl_f+0x13b kern_ioctl+0x401 sys_ioctl+0x153 amd64_syscall+0x3e7 Xfast_syscall+0xfb See also: https://lists.freebsd.org/pipermail/freebsd-current/2014-September/052018.html 
Log excerpt with the sysctl enabled (single-disk pool): Oct 28 12:27:03 r500 kernel: [415] GEOM_ELI: g_eli_read_done() failed (error=5) label/prot1.eli[READ(offset=4654291968, length=9216)] Oct 28 12:27:03 r500 kernel: [415] da0 at umass-sim0 bus 0 scbus2 target 0 lun 0 Oct 28 12:27:03 r500 kernel: [415] da0: s/n 22B39DCC detached Oct 28 12:27:03 r500 kernel: [415] GEOM_ELI: Device label/prot1.eli destroyed. Oct 28 12:27:03 r500 kernel: [415] GEOM_ELI: Detached label/prot1.eli on last close. Oct 28 12:27:03 r500 kernel: [415] (da0:umass-sim0:0:0:0): Periph destroyed Oct 28 12:27:03 r500 ZFS: vdev is removed, pool_guid=2852801396507945726 vdev_guid=18325185749120546863 Oct 28 12:27:04 r500 kernel: [416] spa_sync: No writable vdev for prot1. Oct 28 12:27:05 r500 kernel: [417] spa_sync: No writable vdev for prot1. Oct 28 12:27:06 r500 kernel: [418] spa_sync: No writable vdev for prot1. Oct 28 12:27:07 r500 kernel: [419] spa_sync: No writable vdev for prot1. Oct 28 12:27:08 r500 kernel: [420] spa_sync: No writable vdev for prot1. Oct 28 12:27:09 r500 kernel: [420] umass0: on usbus1 Oct 28 12:27:09 r500 kernel: [420] umass0: SCSI over Bulk-Only; quirks = 0x4101 Oct 28 12:27:09 r500 kernel: [420] umass0:2:0: Attached to scbus2 Oct 28 12:27:09 r500 kernel: [420] da0 at umass-sim0 bus 0 scbus2 target 0 lun 0 Oct 28 12:27:09 r500 kernel: [420] da0: Removable Direct Access SCSI-2 device Oct 28 12:27:09 r500 kernel: [420] da0: Serial Number 22B39DCC Oct 28 12:27:09 r500 kernel: [420] da0: 40.000MB/s transfers Oct 28 12:27:09 r500 kernel: [420] da0: 7782MB (15937536 512 byte sectors) Oct 28 12:27:09 r500 kernel: [420] da0: quirks=0x2 Oct 28 12:27:09 r500 kernel: [421] GEOM_PART: integrity check failed (label/prot1, MBR) Oct 28 12:27:09 r500 kernel: [421] spa_sync: No writable vdev for prot1. Oct 28 12:27:10 r500 kernel: [422] spa_sync: No writable vdev for prot1. Oct 28 12:27:11 r500 kernel: [423] spa_sync: No writable vdev for prot1. 
Oct 28 12:27:12 r500 kernel: [423] GEOM_ELI: Device label/prot1.eli created. Oct 28 12:27:12 r500 kernel: [423] GEOM_ELI: Encryption: AES-XTS 256 Oct 28 12:27:12 r500 kernel: [423] GEOM_ELI: Crypto: software Oct 28 12:27:12 r500 ZFS: vdev state changed, pool_guid=2852801396507945726 vdev_guid=18325185749120546863 Oct 28 12:27:12 r500 kernel: [424] spa_sync: vdev for prot1 became writable again. Device loss was triggered and fixed with: sudo usbconfig -d 1.2 reset && zogftw import prot1 && sudo zpool clear prot1 This commit is experimental! It should be a nop unless the vfs.zfs.spa_sync_wait_for_writable_vdev sysctl is set, though. Setting the sysctl merely shrinks the race window and thus does not completely solve the problem. The problem can still be reproduced with an artificial test case like: Shell 1: mdconfig -u 0 -f /dpool/scratch/test-vdev.img zpool create test /dev/md0 while sleep 1; do mdconfig -d -u 0 -o force && mdconfig -f /dpool/scratch/test-vdev.img && zpool clear test; done Shell 2: # Cause writes to the pool from another shell, for example # by creating datasets. 
Log excerpt (from test begin to deadlock): Oct 29 12:34:28 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:44:42 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:45:04 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:43 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:43 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:44 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:44 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:45 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:45 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:46 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:46 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:47 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:47 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:48 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:48 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:49 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:49 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:51 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:51 
kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:52 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:52 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:53 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:54 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:54 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:55 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:55 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:56 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:56 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:57 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:57 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:59 kendra ZFS: vdev is removed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:46:59 kendra ZFS: vdev state changed, pool_guid=16039353738236808887 vdev_guid=3080051161477470469 Oct 29 12:47:00 kendra kernel: g_dev_taste: make_dev_p() failed (gp->name=md0, error=17) With the deadman enabled, this will also cause: panic: I/O to pool 'test' appears to be hung on vdev guid 3080051161477470469 at '/dev/md0'. 
cpuid = 0 KDB: stack backtrace: db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe01136af870 vpanic() at vpanic+0x182/frame 0xfffffe01136af8f0 panic() at panic+0x43/frame 0xfffffe01136af950 vdev_deadman() at vdev_deadman+0x127/frame 0xfffffe01136af9a0 vdev_deadman() at vdev_deadman+0x40/frame 0xfffffe01136af9f0 spa_deadman() at spa_deadman+0x86/frame 0xfffffe01136afa20 softclock_call_cc() at softclock_call_cc+0x1a3/frame 0xfffffe01136afaf0 softclock() at softclock+0x94/frame 0xfffffe01136afb20 intr_event_execute_handlers() at intr_event_execute_handlers+0x1b6/frame 0xfffffe01136afb60 ithread_loop() at ithread_loop+0xa6/frame 0xfffffe01136afbb0 fork_exit() at fork_exit+0x9c/frame 0xfffffe01136afbf0 fork_trampoline() at fork_trampoline+0xe/frame 0xfffffe01136afbf0 With test's txg_sync_thread being the offender: (kgdb) tid 101874 [Switching to thread 819 (Thread 101874)]#0 sched_switch (td=0xfffff800513649a0, newtd=, flags=) at /usr/src/sys/kern/sched_ule.c:1969 1969 cpuid = PCPU_GET(cpuid); (kgdb) where #0 sched_switch (td=0xfffff800513649a0, newtd=, flags=) at /usr/src/sys/kern/sched_ule.c:1969 #1 0xffffffff805a3a18 in mi_switch (flags=260, newtd=0x0) at /usr/src/sys/kern/kern_synch.c:470 #2 0xffffffff805ea15a in sleepq_wait (wchan=0x0, pri=0) at /usr/src/sys/kern/subr_sleepqueue.c:631 #3 0xffffffff80530509 in _cv_wait (cvp=0xfffff8002678ea98, lock=0xfffff8002678ea78) at /usr/src/sys/kern/kern_condvar.c:139 #4 0xffffffff81930bbb in zio_wait (zio=) at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c:1535 #5 0xffffffff818e4871 in dsl_pool_sync (dp=0xfffff80047dfd000, txg=76) at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c:540 #6 0xffffffff81903653 in spa_sync (spa=0xfffff8009dfe2000, txg=76) at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c:6734 #7 0xffffffff8190ccfa in txg_sync_thread (arg=0xfffff80047dfd000) at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c:517 #8 0xffffffff80556edc 
in fork_exit (callout=0xffffffff8190c970 , arg=0xfffff80047dfd000, frame=0xfffffe011c27bc00) at /usr/src/sys/kern/kern_fork.c:1011 #9 0xffffffff8085b91e in fork_trampoline () at /usr/src/sys/amd64/amd64/exception.S:609 #10 0x0000000000000000 in ?? () (kgdb) f 6 #6 0xffffffff81903653 in spa_sync (spa=0xfffff8009dfe2000, txg=76) at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c:6734 (kgdb) p spa->spa_name $3 = 0xfffff8009dfe2000 "test" --- .../contrib/opensolaris/uts/common/fs/zfs/spa.c | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index cf9112b..a2101ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -6588,6 +6588,31 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) rrw_exit(&dp->dp_config_rwlock, FTAG); } + +/* + * Check if there's at least one (potentially) writable vdev. + */ +int +spa_has_writable_vdev(spa_t *spa) { + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd; + int i; + + for (i = 0; i < rvd->vdev_children; i++) { + vd = rvd->vdev_child[i]; + if (!vd->vdev_cant_write) + return 1; + } + return 0; +} + +static int spa_sync_wait_for_writable_vdev = 0; +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_sync_wait_for_writable_vdev, CTLFLAG_RWTUN, + &spa_sync_wait_for_writable_vdev, 0, + "Let spa_sync() pause if no writable vdev is available. Experimental!"); + +static int spa_sync_deadlock_loops = 0; + /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. 
@@ -6683,6 +6708,29 @@ spa_sync(spa_t *spa, uint64_t txg) spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); + if (spa_sync_wait_for_writable_vdev) { + int waited_for_vdev = 0; + + while (!spa_has_writable_vdev(spa)) { + spa_config_exit(spa, SCL_CONFIG, FTAG); + + waited_for_vdev = 1; + printf("%s: No writable vdev for %s.\n", + __func__, spa_name(spa)); +#ifdef _KERNEL + pause("spa_sync", hz); +#else + sleep(1); +#endif + spa_sync_deadlock_loops++; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + } + if (waited_for_vdev) { + printf("%s: vdev for %s became writable again.\n", + __func__, spa_name(spa)); + } + } dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free) { -- 2.6.2