From ed06783324cc88cf4d13d7e2bfd588c1a1701601 Mon Sep 17 00:00:00 2001
From: Fabian Keil
Date: Fri, 17 Jul 2015 13:11:39 +0200
Subject: [PATCH 216/257] Let the ZFS ARC behave better under memory pressure

Original author: Karl Denninger
Source: https://bz-attachments.freebsd.org/attachment.cgi?id=152852&action=diff&collapsed=&context=patch&format=raw&headers=1
PR: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=187594

Commit adjusted to compile on i386 and to deal with merge conflicts
with (at least):

r263620/6fcf6199a: "Rename global cnt to vm_cnt to avoid shadowing"
r286625/78648874e: "5376 arc_kmem_reap_now() should not result in clearing arc_no_grow"
r286763/71fb6300f: "5497 lock contention on arcs_mtx"

No functional change intended.
---
 .../contrib/opensolaris/uts/common/fs/zfs/arc.c    | 82 +++++++++++++++++-----
 .../contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c | 47 +++++++++++--
 .../opensolaris/uts/common/fs/zfs/dsl_pool.c       |  7 +-
 .../opensolaris/uts/common/fs/zfs/sys/dsl_pool.h   |  1 +
 .../contrib/opensolaris/uts/common/fs/zfs/zio.c    |  4 +-
 5 files changed, 116 insertions(+), 25 deletions(-)

diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 6e9a624513cf..613a4ebdf1d1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -341,6 +341,15 @@ static int arc_dead;
 extern boolean_t zfs_prefetch_disable;
 
 /*
+ * KD 2015-02-10
+ * We have to be able to test for UIO use inside the arc allocator.
+ * NOTE: DO NOT MODIFY HERE!
+ */
+extern int zio_use_uma;
+extern int zfs_dynamic_write_buffer;
+
+
+/*
  * The arc has filled available memory and has now warmed up.
  */
 static boolean_t arc_warm;
@@ -373,7 +382,7 @@ static void
 arc_free_target_init(void *unused __unused)
 {
 
-	zfs_arc_free_target = vm_pageout_wakeup_thresh;
+	zfs_arc_free_target = vm_pageout_wakeup_thresh + ((vm_cnt.v_free_target - vm_pageout_wakeup_thresh) / 2);
 }
 SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
     arc_free_target_init, NULL);
@@ -394,6 +403,9 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
     "log2(fraction of arc to reclaim)");
 SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
     &zfs_compressed_arc_enabled, 0, "Enable compressed ARC");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dynamic_write_buffer, CTLFLAG_RWTUN,
+    &zfs_dynamic_write_buffer, 0,
+    "Dynamically restrict dirty data when memory is low");
 
 /*
  * We don't have a tunable for arc_free_target due to the dependency on
@@ -4011,13 +4023,32 @@ extern kmem_cache_t *zio_buf_cache[];
 extern kmem_cache_t	*zio_data_buf_cache[];
 extern kmem_cache_t	*range_seg_cache;
 
-static __noinline void
-arc_kmem_reap_now(void)
+static void __used
+reap_arc_caches()
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
 	kmem_cache_t		*prev_data_cache = NULL;
 
+	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+		if (zio_buf_cache[i] != prev_cache) {
+			prev_cache = zio_buf_cache[i];
+			kmem_cache_reap_now(zio_buf_cache[i]);
+		}
+		if (zio_data_buf_cache[i] != prev_data_cache) {
+			prev_data_cache = zio_data_buf_cache[i];
+			kmem_cache_reap_now(zio_data_buf_cache[i]);
+		}
+	}
+	kmem_cache_reap_now(buf_cache);
+	kmem_cache_reap_now(hdr_full_cache);
+	kmem_cache_reap_now(hdr_l2only_cache);
+	kmem_cache_reap_now(range_seg_cache);
+}
+
+static __noinline void
+arc_kmem_reap_now(void)
+{
 	DTRACE_PROBE(arc__kmem_reap_start);
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
@@ -4035,20 +4066,7 @@ arc_kmem_reap_now(void)
 #endif
 #endif
 
-	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
-		if (zio_buf_cache[i] != prev_cache) {
-			prev_cache = zio_buf_cache[i];
-			kmem_cache_reap_now(zio_buf_cache[i]);
-		}
-		if (zio_data_buf_cache[i] != prev_data_cache) {
-			prev_data_cache = zio_data_buf_cache[i];
-			kmem_cache_reap_now(zio_data_buf_cache[i]);
-		}
-	}
-	kmem_cache_reap_now(buf_cache);
-	kmem_cache_reap_now(hdr_full_cache);
-	kmem_cache_reap_now(hdr_l2only_cache);
-	kmem_cache_reap_now(range_seg_cache);
+	reap_arc_caches();
 
 #ifdef illumos
 	if (zio_arena != NULL) {
@@ -4083,11 +4101,28 @@ arc_reclaim_thread(void *dummy __unused)
 {
 	hrtime_t		growtime = 0;
 	callb_cpr_t		cpr;
+	int			autoreap = 0;
 
 	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&arc_reclaim_lock);
 	while (!arc_reclaim_thread_exit) {
+#ifdef _KERNEL
+/* KD 2015-02-10
+ * Protect against UMA free memory bloat.  We already do this on a low-memory
+ * basis in the allocator; it has to happen there rather than here due to
+ * response time considerations.  Make the call here once every 10 passes as
+ * well; this reclaims unused UMA buffers every 10 seconds on an idle system
+ * and more frequently if the reclaim thread gets woken up by low RAM
+ * conditions.
+ */
+		if ((zio_use_uma) && (autoreap++ == 10)) {
+			autoreap = 0;
+			DTRACE_PROBE(arc__reclaim_timed_reap);
+			reap_arc_caches();
+		}
+#endif /* _KERNEL */
+
 		int64_t free_memory = arc_available_memory();
 		uint64_t evicted = 0;
 
@@ -4373,6 +4408,19 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 		arc_space_consume(size, ARC_SPACE_META);
 	} else {
 		ASSERT(type == ARC_BUFC_DATA);
+#ifdef _KERNEL
+/* KD 2015-02-10
+ * It would be nice if we could leave this to the arc_reclaim thread.
+ * Unfortunately we cannot; the test has to be done here as well, because
+ * under heavy I/O demand we can grab enough RAM fast enough to induce
+ * nasty oscillation problems.  Fortunately we only need to call this when
+ * the system is under reasonably-severe memory stress.
+ */
+		if (zio_use_uma && (ptob(vm_cnt.v_free_count) + size < ptob(vm_cnt.v_free_target))) {
+			DTRACE_PROBE3(arc__alloc_lowmem_reap, int, vm_cnt.v_free_count, int, size, int, vm_cnt.v_free_target);
+			reap_arc_caches();
+		}
+#endif /* _KERNEL */
 		datap = zio_data_buf_alloc(size);
 		arc_space_consume(size, ARC_SPACE_DATA);
 	}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index 68381488b38c..5bcc2cec49e3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -43,6 +43,8 @@
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
 
+extern int zio_use_uma;	/* Needs to be visible; DO NOT MODIFY! */
+int zfs_dynamic_write_buffer = 1;	/* Dynamically tune writes */
 
 dmu_tx_t *
 dmu_tx_create_dd(dsl_dir_t *dd)
@@ -1106,7 +1108,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 {
 	dsl_pool_t *dp = tx->tx_pool;
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	hrtime_t wakeup, min_tx_time, now;
 
 	if (dirty <= delay_min_bytes)
@@ -1118,11 +1120,11 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 	 * have to handle the case of it being >= the max, which could
 	 * cause a divide-by-zero if it's == the max.
 	 */
-	ASSERT3U(dirty, <, zfs_dirty_data_max);
+	ASSERT3U(dirty, <, zfs_dirty_data_max_internal);
 
 	now = gethrtime();
 	min_tx_time = zfs_delay_scale *
-	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max_internal - dirty);
 	if (now > tx->tx_start + min_tx_time)
 		return;
 
@@ -1327,6 +1329,7 @@ int
 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	int err;
+	static uint64_t	last_max;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
@@ -1339,6 +1342,42 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 	if (txg_how == TXG_WAITED)
 		tx->tx_waited = B_TRUE;
 
+#ifdef _KERNEL
+	/*
+	 * KD 2014-09-22
+	 * If UMA is enabled it can only return a previously-used block
+	 * of identical size to what it had out before.  If it's not the
+	 * same size it will allocate a new one.  This is a problem because
+	 * dirty_data_max is the total dirty write data allowed out at any
+	 * given time, but with UMA on that can multiply by the number of
+	 * different block sizes (!!) requested in terms of free RAM that
+	 * is left allocated but unused.  For this reason never allow
+	 * dirty_data_max to exceed the difference between the paging
+	 * threshold and the current free memory, with a minimum of 256MB.
+	 * This throttles "burst" allocations and prevents the system from
+	 * choking during times of high write I/O demand.
+	 *
+	 * We allow this to be turned off if you want with
+	 * "vfs.zfs_dynamic_write_buffer=0", which can be done in real time.
+	 *
+	 * Note that we work on the zfs_dirty_data_max_internal variable,
+	 * because the user may set zfs_dirty_data_max himself and we must
+	 * must honor that as a hard cap so it remains a usable tunable value.
+	 */
+	if (zio_use_uma & zfs_dynamic_write_buffer) {
+		zfs_dirty_data_max_internal = 1 << 28;
+		zfs_dirty_data_max_internal = MAX(zfs_dirty_data_max_internal, ptob(vm_cnt.v_free_count - vm_cnt.v_free_target));
+		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max);
+		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max_max);
+		if (last_max != (zfs_dirty_data_max_internal / (1024 * 1024))) {
+			last_max = zfs_dirty_data_max_internal / (1024 * 1024);
+			DTRACE_PROBE1(dmu__tx_dirty, uint64_t, last_max);
+		}
+	} else {
+		zfs_dirty_data_max_internal = zfs_dirty_data_max;
+	}
+#endif /* _KERNEL */
+
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
@@ -1369,7 +1408,7 @@ dmu_tx_wait(dmu_tx_t *tx)
 	 * space.
 	 */
 	mutex_enter(&dp->dp_lock);
-	while (dp->dp_dirty_total >= zfs_dirty_data_max)
+	while (dp->dp_dirty_total >= zfs_dirty_data_max_internal)
 		cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
 	uint64_t dirty = dp->dp_dirty_total;
 	mutex_exit(&dp->dp_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index 9b3b79bfb517..33820f76e7b3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -99,8 +99,11 @@
 /*
  * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
+ * We also dynamically tune during low memory, honoring the sysctl set, so
+ * internal comparisons are against zfs_dirty_data_max_internal.
  */
 uint64_t zfs_dirty_data_max;
+uint64_t zfs_dirty_data_max_internal;
 uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
 int zfs_dirty_data_max_percent = 10;
 
@@ -548,7 +551,7 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 	 * Note: we signal even when increasing dp_dirty_total.
 	 * This ensures forward progress -- each thread wakes the next waiter.
 	 */
-	if (dp->dp_dirty_total <= zfs_dirty_data_max)
+	if (dp->dp_dirty_total <= zfs_dirty_data_max_internal)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
@@ -730,7 +733,7 @@ boolean_t
 dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	boolean_t rv;
 
 	mutex_enter(&dp->dp_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
index 0e27a538208c..ee97b5719cd2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
@@ -50,6 +50,7 @@ struct dmu_tx;
 struct dsl_scan;
 
 extern uint64_t zfs_dirty_data_max;
+extern uint64_t zfs_dirty_data_max_internal;
 extern uint64_t zfs_dirty_data_max_max;
 extern uint64_t zfs_dirty_data_sync;
 extern int zfs_dirty_data_max_percent;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index f8907c1e6872..d5ab54271619 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -46,9 +46,9 @@
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
 #if defined(__amd64__)
-static int zio_use_uma = 1;
+int zio_use_uma = 1;
 #else
-static int zio_use_uma = 0;
+int zio_use_uma = 0;
 #endif
 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
     "Use uma(9) for ZIO allocations");
-- 
2.11.0