Skip to content

Commit 3ab4934

Browse files
MDEV-36226 Stall and crash when page cleaner fails to generate free pages during Async flush
During regular iteration the page cleaner does flush from flush list with some flush target and then goes for generating free pages from LRU tail. When asynchronous flush is triggered i.e. when 7/8 th of the LSN margin is filled in the redo log, the flush target for flush list is set to innodb_io_capacity_max. If it could flush all, the flush bandwidth for LRU flush is currently set to zero. If the LRU tail has dirty pages, page cleaner ends up freeing no pages in one iteration. The scenario could repeat across multiple iterations till async flush target is reached. During this time the DB system is starved of free pages resulting in apparent stall and in some cases dict_sys latch fatal error. Fix: In page cleaner iteration, before LRU flush, ensure we provide enough flush limit so that freeing pages is no blocked by dirty pages in LRU tail. Log IO and flush state if double write flush wait is long. Reviewed by: Marko Mäkelä
1 parent 0e8e006 commit 3ab4934

File tree

9 files changed

+142
-8
lines changed

9 files changed

+142
-8
lines changed

mysql-test/mariadb-test-run.pl

+1
Original file line numberDiff line numberDiff line change
@@ -4464,6 +4464,7 @@ ($$)
44644464
qr/InnoDB: innodb_open_files .* should not be greater than/,
44654465
qr/InnoDB: Trying to delete tablespace.*but there are.*pending/,
44664466
qr/InnoDB: Tablespace 1[0-9]* was not found at .*, and innodb_force_recovery was set/,
4467+
qr/InnoDB: Long wait \([0-9]+ seconds\) for double-write buffer flush/,
44674468
qr/Slave: Unknown table 't1' .* 1051/,
44684469
qr/Slave SQL:.*(Internal MariaDB error code: [[:digit:]]+|Query:.*)/,
44694470
qr/slave SQL thread aborted/,

storage/innobase/buf/buf0dblwr.cc

+49-3
Original file line numberDiff line numberDiff line change
@@ -602,20 +602,66 @@ static void buf_dblwr_check_block(const buf_page_t *bpage) noexcept
602602
}
603603
#endif /* UNIV_DEBUG */
604604

605+
ATTRIBUTE_COLD void buf_dblwr_t::print_info() const noexcept
606+
{
607+
mysql_mutex_assert_owner(&mutex);
608+
const slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
609+
610+
sql_print_information("Innodb: Double Write State\n"
611+
"-------------------\n"
612+
"Batch running : %s\n"
613+
"Active Slot - first_free: %lu reserved: %lu\n"
614+
"Flush Slot - first_free: %lu reserved: %lu\n"
615+
"-------------------",
616+
(batch_running ? "true" : "false"),
617+
active_slot->first_free, active_slot->reserved,
618+
flush_slot->first_free, flush_slot->reserved);
619+
}
620+
605621
bool buf_dblwr_t::flush_buffered_writes(const ulint size) noexcept
606622
{
607623
mysql_mutex_assert_owner(&mutex);
608624
ut_ad(size == block_size());
609625

610-
for (;;)
626+
const ulong max_count= 60 * 60;
627+
const ulong first_log_count= 30;
628+
const ulong fatal_threshold= srv_fatal_semaphore_wait_threshold;
629+
ulong log_count= first_log_count;
630+
631+
for (ulong count= 0;;)
611632
{
612633
if (!active_slot->first_free)
613634
return false;
614635
if (!batch_running)
615636
break;
616-
my_cond_wait(&cond, &mutex.m_mutex);
617-
}
618637

638+
timespec abstime;
639+
set_timespec(abstime, 1);
640+
my_cond_timedwait(&cond, &mutex.m_mutex, &abstime);
641+
642+
if (count > fatal_threshold)
643+
{
644+
buf_pool.print_flush_info();
645+
print_info();
646+
ib::fatal() << "Innodb: Long wait (" << count
647+
<< " seconds) for double-write buffer flush.";
648+
}
649+
else if (++count < first_log_count && !(count % 5))
650+
{
651+
sql_print_information("Innodb: Long wait (%lu seconds) for double-write"
652+
" buffer flush.", count);
653+
buf_pool.print_flush_info();
654+
print_info();
655+
}
656+
else if (!(count % log_count))
657+
{
658+
sql_print_warning("Innodb: Long wait (%lu seconds) for double-write"
659+
" buffer flush.", count);
660+
buf_pool.print_flush_info();
661+
print_info();
662+
log_count= log_count >= max_count ? max_count : log_count * 2;
663+
}
664+
}
619665
ut_ad(active_slot->reserved == active_slot->first_free);
620666
ut_ad(!flushing_buffered_writes);
621667

storage/innobase/buf/buf0flu.cc

+73-5
Original file line numberDiff line numberDiff line change
@@ -1374,7 +1374,10 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) noexcept
13741374
break;
13751375
}
13761376

1377-
if (neighbors && space->is_rotational())
1377+
if (neighbors && space->is_rotational() &&
1378+
/* Skip neighbourhood flush from LRU list if we haven't yet reached
1379+
half of the free page target. */
1380+
UT_LIST_GET_LEN(buf_pool.free) * 2 >= free_limit)
13781381
n->flushed+= buf_flush_try_neighbors(space, page_id, bpage,
13791382
neighbors == 1,
13801383
n->flushed, max);
@@ -1763,8 +1766,16 @@ static ulint buf_flush_LRU(ulint max_n) noexcept
17631766
buf_do_LRU_batch(max_n, &n);
17641767

17651768
ulint pages= n.flushed;
1769+
ulint evicted= n.evicted;
17661770

1767-
if (n.evicted)
1771+
/* If we have exhausted flush quota, it is likely we exited before
1772+
generating enough free pages. Call once more with 0 flush to generate
1773+
free pages immediately as required. */
1774+
if (pages == max_n)
1775+
buf_do_LRU_batch(0, &n);
1776+
1777+
evicted+= n.evicted;
1778+
if (evicted)
17681779
{
17691780
buf_pool.try_LRU_scan= true;
17701781
pthread_cond_broadcast(&buf_pool.done_free);
@@ -2331,6 +2342,11 @@ static void buf_flush_page_cleaner() noexcept
23312342
DBUG_EXECUTE_IF("ib_page_cleaner_sleep",
23322343
{
23332344
std::this_thread::sleep_for(std::chrono::seconds(1));
2345+
/* Cover the logging code in debug mode. */
2346+
buf_pool.print_flush_info();
2347+
buf_dblwr.lock();
2348+
buf_dblwr.print_info();
2349+
buf_dblwr.unlock();
23342350
});
23352351
lsn_limit= buf_flush_sync_lsn;
23362352

@@ -2535,6 +2551,10 @@ static void buf_flush_page_cleaner() noexcept
25352551

25362552
n= srv_max_io_capacity;
25372553
n= n >= n_flushed ? n - n_flushed : 0;
2554+
/* It is critical to generate free pages to keep the system alive. Make
2555+
sure we are not hindered by dirty pages in LRU tail. */
2556+
n= std::max<ulint>(n, std::min<ulint>(srv_max_io_capacity,
2557+
buf_pool.LRU_scan_depth));
25382558
goto LRU_flush;
25392559
}
25402560

@@ -2573,10 +2593,11 @@ ATTRIBUTE_COLD void buf_pool_t::LRU_warn() noexcept
25732593
mysql_mutex_assert_owner(&mutex);
25742594
try_LRU_scan= false;
25752595
if (!LRU_warned.test_and_set(std::memory_order_acquire))
2596+
{
25762597
sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!"
2577-
" %zu blocks are in use and %zu free."
2578-
" Consider increasing innodb_buffer_pool_size.",
2579-
UT_LIST_GET_LEN(LRU), UT_LIST_GET_LEN(free));
2598+
" Consider increasing innodb_buffer_pool_size.");
2599+
buf_pool.print_flush_info();
2600+
}
25802601
}
25812602

25822603
/** Initialize page_cleaner. */
@@ -2656,6 +2677,53 @@ void buf_flush_sync() noexcept
26562677
thd_wait_end(nullptr);
26572678
}
26582679

2680+
ATTRIBUTE_COLD void buf_pool_t::print_flush_info() const noexcept
2681+
{
2682+
/* We do dirty read of UT_LIST count variable. */
2683+
ulint lru_size= UT_LIST_GET_LEN(LRU);
2684+
ulint dirty_size= UT_LIST_GET_LEN(flush_list);
2685+
ulint free_size= UT_LIST_GET_LEN(free);
2686+
ulint dirty_pct= lru_size ? dirty_size * 100 / (lru_size + free_size) : 0;
2687+
sql_print_information("Innodb: Buffer Pool pages \n"
2688+
"-------------------\n"
2689+
"LRU Pages: %lu\n"
2690+
"Free Pages: %lu\n"
2691+
"Dirty Pages: %lu : %lu%%\n"
2692+
"-------------------",
2693+
lru_size, free_size, dirty_size, dirty_pct);
2694+
2695+
lsn_t lsn= log_sys.get_lsn();
2696+
lsn_t clsn= log_sys.last_checkpoint_lsn;
2697+
sql_print_information("Innodb: LSN flush parameters\n"
2698+
"-------------------\n"
2699+
"System LSN : %" PRIu64 "\n"
2700+
"Checkpoint LSN: %" PRIu64 "\n"
2701+
"Flush ASync LSN: %" PRIu64 "\n"
2702+
"Flush Sync LSN: %" PRIu64 "\n"
2703+
"-------------------",
2704+
lsn, clsn, buf_flush_async_lsn.load(), buf_flush_sync_lsn.load());
2705+
2706+
lsn_t age= lsn - clsn;
2707+
lsn_t age_pct= log_sys.max_checkpoint_age
2708+
? age * 100 / log_sys.max_checkpoint_age : 0;
2709+
sql_print_information("Innodb: LSN age parameters\n"
2710+
"-------------------\n"
2711+
"Current Age : %" PRIu64 " : %" PRIu64 "%%\n"
2712+
"Max Age(Async): %" PRIu64 "\n"
2713+
"Max Age(Sync) : %" PRIu64 "\n"
2714+
"Capacity : %" PRIu64 "\n"
2715+
"-------------------",
2716+
age, age_pct, log_sys.max_modified_age_async, log_sys.max_checkpoint_age,
2717+
log_sys.log_capacity);
2718+
2719+
sql_print_information("Innodb: Pending IO count\n"
2720+
"-------------------\n"
2721+
"Pending Read: %zu\n"
2722+
"Pending Write: %zu\n"
2723+
"-------------------",
2724+
os_aio_pending_reads_approx(), os_aio_pending_writes_approx());
2725+
}
2726+
26592727
#ifdef UNIV_DEBUG
26602728
/** Functor to validate the flush list. */
26612729
struct Check {

storage/innobase/dict/dict0dict.cc

+4
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ Created 1/8/1996 Heikki Tuuri
4444
#include "btr0cur.h"
4545
#include "btr0sea.h"
4646
#include "buf0buf.h"
47+
#include "buf0flu.h"
4748
#include "data0type.h"
4849
#include "dict0boot.h"
4950
#include "dict0load.h"
@@ -1012,7 +1013,10 @@ void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line)) noexc
10121013
const ulong threshold= srv_fatal_semaphore_wait_threshold;
10131014

10141015
if (waited >= threshold)
1016+
{
1017+
buf_pool.print_flush_info();
10151018
ib::fatal() << fatal_msg;
1019+
}
10161020

10171021
if (waited > threshold / 4)
10181022
ib::warn() << "A long wait (" << waited

storage/innobase/include/buf0buf.h

+3
Original file line numberDiff line numberDiff line change
@@ -1951,6 +1951,9 @@ class buf_pool_t
19511951
/** Issue a warning that we could not free up buffer pool pages. */
19521952
ATTRIBUTE_COLD void LRU_warn() noexcept;
19531953

1954+
/** Print buffer pool flush state information. */
1955+
ATTRIBUTE_COLD void print_flush_info() const noexcept;
1956+
19541957
private:
19551958
/** Temporary memory for page_compressed and encrypted I/O */
19561959
struct io_buf_t

storage/innobase/include/buf0dblwr.h

+3
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,9 @@ class buf_dblwr_t
159159
my_cond_wait(&cond, &mutex.m_mutex);
160160
mysql_mutex_unlock(&mutex);
161161
}
162+
163+
/** Print double write state information. */
164+
ATTRIBUTE_COLD void print_info() const noexcept;
162165
};
163166

164167
/** The doublewrite buffer */

storage/innobase/include/os0file.h

+2
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,8 @@ size_t os_aio_pending_reads() noexcept;
10211021
size_t os_aio_pending_reads_approx() noexcept;
10221022
/** @return number of pending writes */
10231023
size_t os_aio_pending_writes() noexcept;
1024+
/** @return approximate number of pending writes */
1025+
size_t os_aio_pending_writes_approx() noexcept;
10241026

10251027
/** Wait until there are no pending asynchronous writes.
10261028
@param declare whether the wait will be declared in tpool */

storage/innobase/os/os0file.cc

+6
Original file line numberDiff line numberDiff line change
@@ -3481,6 +3481,12 @@ size_t os_aio_pending_writes() noexcept
34813481
return pending;
34823482
}
34833483

3484+
/** @return approximate number of pending writes */
3485+
size_t os_aio_pending_writes_approx() noexcept
3486+
{
3487+
return write_slots->pending_io_count();
3488+
}
3489+
34843490
/** Wait until all pending asynchronous reads have completed.
34853491
@param declare whether the wait will be declared in tpool */
34863492
void os_aio_wait_until_no_pending_reads(bool declare) noexcept

storage/innobase/srv/srv0srv.cc

+1
Original file line numberDiff line numberDiff line change
@@ -1220,6 +1220,7 @@ void srv_monitor_task(void*)
12201220
now -= start;
12211221
ulong waited = static_cast<ulong>(now / 1000000);
12221222
if (waited >= threshold) {
1223+
buf_pool.print_flush_info();
12231224
ib::fatal() << dict_sys.fatal_msg;
12241225
}
12251226

0 commit comments

Comments
 (0)