Skip to content

Commit

Permalink
commitlog: Add optional use of O_DSYNC mode
Browse files Browse the repository at this point in the history
Refs #3929

Optionally enables O_DSYNC mode for segment files, and when
enabled ignores actual flushing and just barriers any ongoing
writes.

Iff using O_DSYNC mode, we will not only truncate the file
to max size, but also do an actual initial write of zero:s
to it, since XFS (intended target) has observably less good
behaviour on non-physical file blocks. Once written (and maybe
recycled) we should have rather satisfying throughput on writes.

Note that the O_DSYNC behaviour is hidden behind a default
disabled option. While user should probably seldom worry about
this, we should add some sort of logic i main/init that unless
specified by user, evaluates the commitlog disk and sets this
to true if it is using XFS and looks ok. This is because using
O_DSYNC on things like EXT4 etc has quite horrible performance.

All above statements about performance and O_DSYNC behaviour
are based on a sampling of benchmark results (modified fsqual)
on a statistically non-ssignificant selection of disks. However,
at least there the observed behaviour is a rather large
difference between ::fallocate:ed disk area vs. actually written
using O_DSYNC on XFS, and O_DSYNC on EXT4.

Note also that measurements on O_DSYNC vs. no O_DSYNC does not
take into account the wall-clock time of doing manual disk flush.
This is intentionally ignored, since in the commitlog case, at
least using periodic mode, flushes are relatively rare.

Message-Id: <20190520120331.10229-1-calle@scylladb.com>
  • Loading branch information
Calle Wilund authored and avikivity committed May 20, 2019
1 parent d92973b commit 1e37e1d
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 9 deletions.
60 changes: 51 additions & 9 deletions db/commitlog/commitlog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ db::commitlog::config db::commitlog::config::from_db_config(const db::config& cf
c.mode = cfg.commitlog_sync() == "batch" ? sync_mode::BATCH : sync_mode::PERIODIC;
c.extensions = &cfg.extensions();
c.reuse_segments = cfg.commitlog_reuse_segments();
c.use_o_dsync = cfg.commitlog_use_o_dsync();

return c;
}
Expand Down Expand Up @@ -331,6 +332,7 @@ class db::commitlog::segment_manager : public ::enable_shared_from_this<segment_
using buffer_type = fragmented_temporary_buffer;

buffer_type acquire_buffer(size_t s);
temporary_buffer<char> allocate_single_buffer(size_t);

future<std::vector<descriptor>> list_descriptors(sstring dir);

Expand Down Expand Up @@ -1258,10 +1260,47 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
return fut;
});

return fut.then([this, d, active, filename](file f) {
return fut.then([this, d, active, filename, flags](file f) {
f = make_checked_file(commit_error_handler, f);
// xfs doesn't like files extended betond eof, so enlarge the file
return f.truncate(max_size).then([this, d, active, f, filename] () mutable {
auto fut = make_ready_future<>();
// If file is opened with O_DSYNC, we should explicitly write zeros
// instead of just truncate/fallocate. Otherwise we get crappy
// behaviour.
if ((flags & open_flags::dsync) != open_flags{}) {
clogger.trace("Pre-writing {}KB to segment {}", max_size/1024, filename);
// would be super nice if we just could mmap(/dev/zero) and do sendto
// instead of this, but for now we must do explicit buffer writes.
fut = f.allocate(0, max_size).then([f, this]() mutable {
static constexpr size_t buf_size = 4 * segment::alignment;
return do_with(allocate_single_buffer(buf_size), max_size, [this, f](temporary_buffer<char>& buf, uint64_t& rem) mutable {
std::fill(buf.get_write(), buf.get_write() + buf.size(), 0);
return repeat([this, f, &rem, &buf]() mutable {
if (rem == 0) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
static constexpr size_t max_write = 128 * 1024;
auto n = std::min(max_write / buf_size, 1 + rem / buf_size);

std::vector<iovec> v;
v.reserve(n);
size_t m = 0;
while (m < rem && n < max_write) {
auto s = std::min(rem - m, buf_size);
v.emplace_back(iovec{ buf.get_write(), s});
m += s;
}
return f.dma_write(max_size - rem, std::move(v)).then([&rem](size_t s) {
rem -= s;
return stop_iteration::no;
});
});
});
});
} else {
fut = f.truncate(max_size);
}
return fut.then([this, d, active, f, filename] () mutable {
auto s = make_shared<segment>(this->shared_from_this(), d, std::move(f), active);
return make_ready_future<sseg_ptr>(s);
});
Expand All @@ -1279,6 +1318,10 @@ future<> db::commitlog::segment_manager::rename_file(sstring from, sstring to) c
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
descriptor d(next_id(), cfg.fname_prefix);
auto dst = filename(d);
auto flags = open_flags::wo;
if (cfg.use_o_dsync) {
flags |= open_flags::dsync;
}

if (!_recycled_segments.empty()) {
auto src = std::move(_recycled_segments.front());
Expand All @@ -1289,11 +1332,11 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
// out-of-order files. (Sort does not help).
clogger.debug("Using recycled segment file {} -> {}", src, dst);
return rename_file(std::move(src), dst).then([=] {
return allocate_segment_ex(d, dst, open_flags::wo, false);
return allocate_segment_ex(d, dst, flags, false);
});
}

return allocate_segment_ex(d, dst, open_flags::wo|open_flags::create, active);
return allocate_segment_ex(d, dst, flags|open_flags::create, active);
}

future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::new_segment() {
Expand Down Expand Up @@ -1611,6 +1654,9 @@ uint64_t db::commitlog::segment_manager::get_num_active_segments() const {
});
}

temporary_buffer<char> db::commitlog::segment_manager::allocate_single_buffer(size_t s) {
return temporary_buffer<char>::aligned(segment::alignment, s);
}

db::commitlog::segment_manager::buffer_type db::commitlog::segment_manager::acquire_buffer(size_t s) {
s = align_up(s, segment::default_size);
Expand All @@ -1619,11 +1665,7 @@ db::commitlog::segment_manager::buffer_type db::commitlog::segment_manager::acqu
std::vector<temporary_buffer<char>> buffers;
buffers.reserve(fragment_count);
while (buffers.size() < fragment_count) {
auto a = ::memalign(segment::alignment, segment::default_size);
if (a == nullptr) {
throw std::bad_alloc();
}
buffers.emplace_back(static_cast<char*>(a), segment::default_size, make_free_deleter(a));
buffers.emplace_back(allocate_single_buffer(segment::default_size));
}
clogger.trace("Allocated {} k buffer", s / 1024);
return fragmented_temporary_buffer(std::move(buffers), s);
Expand Down
1 change: 1 addition & 0 deletions db/commitlog/commitlog.hh
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ public:
std::string fname_prefix = descriptor::FILENAME_PREFIX;

bool reuse_segments = true;
bool use_o_dsync = false;

const db::extensions * extensions = nullptr;
};
Expand Down
2 changes: 2 additions & 0 deletions db/config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"Related information: Configuring memtable throughput")
, commitlog_reuse_segments(this, "commitlog_reuse_segments", value_status::Used, true,
"Whether or not to re-use commitlog segments when finished instead of deleting them. Can improve commitlog latency on some file systems.\n")
, commitlog_use_o_dsync(this, "commitlog_use_o_dsync", value_status::Used, true,
"Whether or not to use O_DSYNC mode for commitlog segments IO. Can improve commitlog latency on some file systems.\n")
/* Compaction settings */
/* Related information: Configuring compaction */
, compaction_preheat_key_cache(this, "compaction_preheat_key_cache", value_status::Unused, true,
Expand Down
1 change: 1 addition & 0 deletions db/config.hh
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ public:
named_value<uint32_t> commitlog_sync_batch_window_in_ms;
named_value<int64_t> commitlog_total_space_in_mb;
named_value<bool> commitlog_reuse_segments;
named_value<bool> commitlog_use_o_dsync;
named_value<bool> compaction_preheat_key_cache;
named_value<uint32_t> concurrent_compactors;
named_value<uint32_t> in_memory_compaction_limit_in_mb;
Expand Down

0 comments on commit 1e37e1d

Please # to comment.